diff --git a/WORKSPACE b/WORKSPACE index 17961829a60..0c7bc085b51 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -14,6 +14,33 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories") closure_repositories() +http_archive( + name = "base_images_docker", + sha256 = "e2b1b7254270bb7605e814a9dbf6d1e4ae04a11136ff1714fbfdabe3f87f7cf9", + strip_prefix = "base-images-docker-12801524f867e657fbb5d1a74f31618aff181ac6", + urls = ["https://github.com/GoogleCloudPlatform/base-images-docker/archive/12801524f867e657fbb5d1a74f31618aff181ac6.tar.gz"], +) + +http_archive( + name = "bazel_toolchains", + sha256 = "15b5858b1b5541ec44df31b94c3b8672815b31d71215a98398761ea9f4c4eedb", + strip_prefix = "bazel-toolchains-6200b238c9c2d137c0d9a7262c80cc71d98e692b", + urls = [ + "https://github.com/bazelbuild/bazel-toolchains/archive/6200b238c9c2d137c0d9a7262c80cc71d98e692b.tar.gz", + ], +) + +http_archive( + name = "io_bazel_rules_docker", + sha256 = "29d109605e0d6f9c892584f07275b8c9260803bf0c6fcb7de2623b2bedc910bd", + strip_prefix = "rules_docker-0.5.1", + urls = ["https://github.com/bazelbuild/rules_docker/archive/v0.5.1.tar.gz"], +) + +load("//third_party/toolchains/preconfig/generate:workspace.bzl", "remote_config_workspace") + +remote_config_workspace() + # We must check the bazel version before trying to parse any other BUILD # files, in case the parsing of those build files depends on the bazel # version we require here. @@ -79,3 +106,4 @@ new_http_archive( "http://download.tensorflow.org/models/speech_commands_v0.01.zip", ], ) + diff --git a/configure.py b/configure.py index 2eeeceb3399..234561d94a4 100644 --- a/configure.py +++ b/configure.py @@ -43,7 +43,7 @@ _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing ' _TF_OPENCL_VERSION = '1.2' _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp' _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include' -_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16] +_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16, 17, 18] _DEFAULT_PROMPT_ASK_ATTEMPTS = 10 @@ -1555,6 +1555,9 @@ def main(): check_bazel_version('0.15.0') reset_tf_configure_bazelrc() + # Explicitly import tools/bazel.rc, this is needed for Bazel 0.19.0 or later + write_to_bazelrc('import %workspace%/tools/bazel.rc') + cleanup_makefile() setup_python(environ_cp) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 11b42f349df..859dc3b8d77 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -352,6 +352,7 @@ package_group( "//tensorflow/...", "//tensorflow_estimator/...", "//tensorflow_fold/llgtm/...", + "//tensorflow_text/...", "//third_party/py/tensor2tensor/...", ], ) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 16f633643d4..b8db1b21449 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -95,6 +95,7 @@ tf_cuda_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/distributed_runtime:server_lib", ], }) + select({ "//tensorflow:with_xla_support": [ @@ -199,7 +200,7 @@ tf_cuda_cc_test( size = "small", srcs = ["c_api_test.cc"], data = [ - ":test_op.so", + ":test_op1.so", "//tensorflow/cc/saved_model:saved_model_half_plus_two", ], kernels = [":test_op_kernel"], @@ -218,6 +219,7 @@ tf_cuda_cc_test( "//tensorflow/cc:grad_ops", "//tensorflow/cc/saved_model:signature_constants", "//tensorflow/cc/saved_model:tag_constants", + "//tensorflow/compiler/jit", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:direct_session", "//tensorflow/core:framework", @@ 
-284,8 +286,8 @@ tf_cc_test( ) tf_custom_op_library( - name = "test_op.so", - srcs = ["test_op.cc"], + name = "test_op1.so", + srcs = ["test_op1.cc"], ) tf_kernel_library( diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 4540dcd6638..f13e8777dff 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -2810,4 +2810,71 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) { } return ret; } + +// TF_Server functions ---------------------------------------------- + +#ifndef __ANDROID__ +TF_Server::TF_Server(std::unique_ptr server) + : target(server->target()), server(std::move(server)) {} +#endif // __ANDROID__ + +TF_Server* TF_NewServer(const void* proto, size_t proto_len, + TF_Status* status) { +#ifdef __ANDROID__ + status->status = tensorflow::errors::Unimplemented( + "Server functionality is not supported in Android"); + return nullptr; +#else + tensorflow::ServerDef server_def; + if (!server_def.ParseFromArray(proto, static_cast(proto_len))) { + status->status = InvalidArgument( + "Could not parse provided bytes into a ServerDef protocol buffer"); + return nullptr; + } + + std::unique_ptr out_server; + status->status = tensorflow::NewServer(server_def, &out_server); + if (!status->status.ok()) return nullptr; + + return new TF_Server(std::move(out_server)); +#endif +} + +void TF_ServerStart(TF_Server* server, TF_Status* status) { +#ifdef __ANDROID__ + status->status = tensorflow::errors::Unimplemented( + "Server functionality is not supported in Android"); +#else + status->status = server->server->Start(); +#endif +} + +void TF_ServerStop(TF_Server* server, TF_Status* status) { +#ifdef __ANDROID__ + status->status = tensorflow::errors::Unimplemented( + "Server functionality is not supported in Android"); +#else + status->status = server->server->Stop(); +#endif +} + +void TF_ServerJoin(TF_Server* server, TF_Status* status) { +#ifdef __ANDROID__ + status->status = tensorflow::errors::Unimplemented( + "Server functionality is not supported in Android"); +#else + status->status = server->server->Join(); +#endif +} + +const char* TF_ServerTarget(TF_Server* server) { +#ifdef __ANDROID__ + return nullptr; +#else + return server->target.c_str(); +#endif +} + +void TF_DeleteServer(TF_Server* server) { delete server; } + } // end extern "C" diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index da8ad1cec59..3d56268110e 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -1668,6 +1668,47 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status); TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp( const char* name, TF_Status* status); +// -------------------------------------------------------------------------- +// In-process TensorFlow server functionality, for use in distributed training. +// A Server instance encapsulates a set of devices and a Session target that +// can participate in distributed training. A server belongs to a cluster +// (specified by a ClusterSpec), and corresponds to a particular task in a +// named job. The server can communicate with any other server in the same +// cluster. + +// In-process TensorFlow server. +typedef struct TF_Server TF_Server; + +// Creates a new in-process TensorFlow server configured using a serialized +// ServerDef protocol buffer provided via `proto` and `proto_len`. +// +// The server will not serve any requests until TF_ServerStart is invoked. +// The server will stop serving requests once TF_ServerStop or +// TF_DeleteServer is invoked. 
+TF_CAPI_EXPORT extern TF_Server* TF_NewServer(const void* proto, + size_t proto_len, + TF_Status* status); + +// Starts an in-process TensorFlow server. +TF_CAPI_EXPORT extern void TF_ServerStart(TF_Server* server, TF_Status* status); + +// Stops an in-process TensorFlow server. +TF_CAPI_EXPORT extern void TF_ServerStop(TF_Server* server, TF_Status* status); + +// Blocks until the server has been successfully stopped (via TF_ServerStop or +// TF_ServerClose). +TF_CAPI_EXPORT extern void TF_ServerJoin(TF_Server* server, TF_Status* status); + +// Returns the target string that can be provided to TF_SetTarget() to connect +// a TF_Session to `server`. +// +// The returned string is valid only until TF_DeleteServer is invoked. +TF_CAPI_EXPORT extern const char* TF_ServerTarget(TF_Server* server); + +// Destroy an in-process TensorFlow server, frees memory. If server is running +// it will be stopped and joined. +TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index 95652a11378..5ba26d3c585 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -25,6 +25,7 @@ limitations under the License. #include #ifndef __ANDROID__ +#include "tensorflow/core/distributed_runtime/server_lib.h" #include "tensorflow/core/framework/op_gen_lib.h" #endif #include "tensorflow/core/common_runtime/shape_refiner.h" @@ -179,6 +180,15 @@ struct TF_ApiDefMap { tensorflow::mutex lock; }; +#ifndef __ANDROID__ +struct TF_Server { + TF_Server(std::unique_ptr server); + + const tensorflow::string target; + std::unique_ptr server; +}; +#endif + namespace tensorflow { class TensorCApi { diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index b0dc0363fdb..d5934a10395 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -187,15 +187,26 @@ TEST(CAPI, LibraryLoadFunctions) { // tf_cuda_cc_test() bazel rule and remove the next line. if (!GPUDeviceName().empty()) return; - // Load the library. - TF_Status* status = TF_NewStatus(); - TF_Library* lib = - TF_LoadLibrary("tensorflow/c/test_op.so", status); - TF_Code code = TF_GetCode(status); - string status_msg(TF_Message(status)); - TF_DeleteStatus(status); - ASSERT_EQ(TF_OK, code) << status_msg; +#if !defined(TENSORFLOW_NO_SHARED_OBJECTS) + { + // Load the library. + TF_Status* status = TF_NewStatus(); + TF_Library* lib = + TF_LoadLibrary("tensorflow/c/test_op1.so", status); + TF_Code code = TF_GetCode(status); + string status_msg(TF_Message(status)); + TF_DeleteStatus(status); + ASSERT_EQ(TF_OK, code) << status_msg; + // Test op list. + TF_Buffer op_list_buf = TF_GetOpList(lib); + tensorflow::OpList op_list; + EXPECT_TRUE(op_list.ParseFromArray(op_list_buf.data, op_list_buf.length)); + ASSERT_EQ(op_list.op_size(), 1); + EXPECT_EQ("TestCApi1", op_list.op(0).name()); + TF_DeleteLibraryHandle(lib); + } +#endif // !defined(TENSORFLOW_NO_SHARED_OBJECTS) { TF_Buffer* op_list_buffer = TF_GetAllOpList(); tensorflow::OpList op_list; @@ -210,19 +221,6 @@ TEST(CAPI, LibraryLoadFunctions) { EXPECT_TRUE(found); TF_DeleteBuffer(op_list_buffer); } - -#if !defined(TENSORFLOW_NO_SHARED_OBJECTS) - { - // Test op list. 
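As an illustrative sketch (not part of this change), the in-process server C API declared above is meant to be driven roughly as follows; the serialized ServerDef buffer is assumed to be produced by the caller:

#include "tensorflow/c/c_api.h"

// Hypothetical helper, for illustration only: runs the full lifecycle of the
// new in-process server API. `server_def_bytes`/`server_def_len` are assumed
// to hold a ServerDef serialized elsewhere (e.g. from a Python caller).
void RunInProcessServer(const void* server_def_bytes, size_t server_def_len) {
  TF_Status* status = TF_NewStatus();
  TF_Server* server = TF_NewServer(server_def_bytes, server_def_len, status);
  if (TF_GetCode(status) == TF_OK) {
    TF_ServerStart(server, status);  // begin serving requests
    // ... point a TF_Session at TF_ServerTarget(server) here ...
    TF_ServerStop(server, status);   // stop serving
    TF_ServerJoin(server, status);   // block until fully stopped
  }
  TF_DeleteServer(server);  // stops and joins a still-running server
  TF_DeleteStatus(status);
}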
- TF_Buffer op_list_buf = TF_GetOpList(lib); - tensorflow::OpList op_list; - EXPECT_TRUE(op_list.ParseFromArray(op_list_buf.data, op_list_buf.length)); - ASSERT_EQ(op_list.op_size(), 1); - EXPECT_EQ("TestCApi", op_list.op(0).name()); - } -#endif // !defined(TENSORFLOW_NO_SHARED_OBJECTS) - - TF_DeleteLibraryHandle(lib); } void TestEncodeDecode(int line, const std::vector& data) { diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 3ee31a6a7ac..ba3d8533db7 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -69,7 +69,7 @@ tf_cuda_library( name = "c_api_internal", hdrs = ["c_api_internal.h"], visibility = [ - "//learning/deepmind/courier:__pkg__", + "//learning/deepmind/courier:__subpackages__", "//tensorflow:internal", ], deps = [ diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 3554ec0bf32..408277468d7 100755 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -404,8 +404,7 @@ const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) { "The passed in handle is a nullptr"); return nullptr; } - tensorflow::Device* d = nullptr; - status->status = h->handle->OpDevice(&d); + tensorflow::Device* d = h->handle->op_device(); return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0" : d->name().c_str(); } diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc index 5006b76f198..52b08245528 100644 --- a/tensorflow/c/eager/c_api_debug.cc +++ b/tensorflow/c/eager/c_api_debug.cc @@ -57,13 +57,9 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( return nullptr; } - tensorflow::Device* device; - status->status = handle->handle->Device(&device); - if (!status->status.ok()) { - return nullptr; - } - #ifdef TENSORFLOW_EAGER_USE_XLA + tensorflow::Device* device = handle->handle->device(); + // If tensor resides on an XLA device, use XLA device's PaddedShapeFn. tensorflow::XlaDevice* xla_device = dynamic_cast(device); diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 104d52430cf..fa1b22e3af4 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -79,10 +79,6 @@ struct TFE_TensorHandle { tensorflow::Device* op_device) : handle(new tensorflow::TensorHandle(t, d, op_device, nullptr)) {} - TFE_TensorHandle(tensorflow::uint64 node_id, tensorflow::DataType dtype, - tensorflow::EagerContext* ctx) - : handle(new tensorflow::TensorHandle(node_id, dtype, ctx)) {} - TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {} tensorflow::TensorHandle* handle; diff --git a/tensorflow/c/test_op1.cc b/tensorflow/c/test_op1.cc new file mode 100644 index 00000000000..b22cc9aef2b --- /dev/null +++ b/tensorflow/c/test_op1.cc @@ -0,0 +1,23 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +REGISTER_OP("TestCApi1").Doc(R"doc(Used to test C API)doc"); + +} // namespace tensorflow diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index c18b07603ae..83353b79f72 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -170,6 +170,7 @@ cc_library_with_android_deps( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", ], ) @@ -516,6 +517,8 @@ tf_gen_op_wrappers_cc( ":array_ops", ":const_op", ":math_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", ], ) diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index 6c29f09cde7..16151e77737 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -93,7 +93,7 @@ cc_library( ":tfcompile_lib", "//tensorflow/compiler/tf2xla:tf2xla_proto", "//tensorflow/compiler/tf2xla:tf2xla_util", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index b95b063348c..d548de8c442 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -26,7 +26,7 @@ limitations under the License. #include "tensorflow/compiler/aot/flags.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" @@ -103,7 +103,7 @@ Status Main(const MainFlags& flags) { return errors::InvalidArgument("Must specify --cpp_class"); } codegen_opts.gen_hlo_profile_printer_data = - xla::legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile(); + xla::GetDebugOptionsFromFlags().xla_hlo_profile(); TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &codegen_opts.class_name, &codegen_opts.namespaces)); @@ -132,7 +132,7 @@ int main(int argc, char** argv) { std::vector flag_list; AppendMainFlags(&flag_list, &flags); - xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); + xla::AppendDebugOptionsFlags(&flag_list); tensorflow::string usage = tensorflow::tfcompile::kUsageHeader; usage += tensorflow::Flags::Usage(argv[0], flag_list); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 0c41e095c7b..5f25e4626ad 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -21,7 +21,6 @@ package( ) load("//tensorflow:tensorflow.bzl", "cc_header_only_library") -load("//tensorflow:tensorflow.bzl", "tf_kernel_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") @@ -52,6 +51,7 @@ cc_library( deps = [ ":jit_compilation_passes", "//tensorflow/compiler/jit/kernels:xla_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/service:cpu_plugin", ], @@ -65,6 +65,7 @@ 
cc_library( ":jit_compilation_passes", "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/xla/service:gpu_plugin", ]), alwayslink = 1, @@ -190,6 +191,7 @@ cc_library( "//tensorflow/core/kernels:resource_variable_ops", "//tensorflow/core/kernels:sendrecv_ops", "//tensorflow/core/kernels:shape_ops", + "//tensorflow/core/kernels:stack", "//tensorflow/core/kernels:variable_ops", "//tensorflow/core/kernels/data:generator_dataset_op", "//tensorflow/core/kernels/data:iterator_ops", @@ -241,6 +243,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:variable_ops", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/memory", ], ) @@ -253,6 +256,7 @@ cc_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/core:core_cpu", @@ -263,6 +267,21 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:variable_ops", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cc_test( + name = "xla_compilation_cache_test", + srcs = [ + "xla_compilation_cache_test.cc", + ], + deps = [ + ":xla_compilation_cache", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/core:test", + "//tensorflow/core:test_main", ], ) @@ -500,6 +519,7 @@ cc_library( "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -524,25 +544,6 @@ cc_library( hdrs = ["union_find.h"], ) -cc_library( - name = "producer_consumer_queue", - hdrs = ["producer_consumer_queue.h"], - deps = ["//tensorflow/core:lib"], -) - -tf_cc_test( - name = "producer_consumer_queue_test", - size = "small", - srcs = ["producer_consumer_queue_test.cc"], - deps = [ - ":producer_consumer_queue", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - ], -) - tf_cc_test( name = "deadness_analysis_test", size = "small", @@ -606,6 +607,7 @@ tf_cc_test( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/cc:xla_jit_ops", "//tensorflow/compiler/tf2xla/cc:xla_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -648,31 +650,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "xla_launch_util_test", - size = "small", - srcs = ["xla_launch_util_test.cc"], - deps = [ - ":common", - ":xla_compilation_cache", - ":xla_launch_util", - ":xla_tensor", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:gpu_runtime", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:test", - "//tensorflow/core/kernels:variable_ops", - ], -) - cc_library( name = "xla_fusion_optimizer", srcs = 
["xla_fusion_optimizer.cc"], diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index 054f31ba335..93637a69d5d 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -214,7 +214,8 @@ Status NodeRequiresCompilation(Node* n, bool* result) { return errors::Internal("Could not find compilation device ", device_type.type()); } - *result = registration->requires_compilation; + *result = registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways; return Status::OK(); } diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc index 617e31488c7..8a73101c184 100644 --- a/tensorflow/compiler/jit/deadness_analysis_test.cc +++ b/tensorflow/compiler/jit/deadness_analysis_test.cc @@ -127,7 +127,8 @@ InductionVarInfo CreateInductionVariable(const Scope& root, Output loop_cond = ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr); ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond); - ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output); + ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), + latch.output_false); Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"), latch.output_true, increment_by); Output next_iteration = @@ -191,7 +192,8 @@ DependentInductionVar CreateDependentLoopInvariantValue( value, frame_name); ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_value, enter_value}); ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond); - ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output); + ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), + latch.output_false); Output next_iteration = ops::NextIteration( root.WithOpName(prefix + "/next_iteration"), latch.output_true); CHECK(root.graph() diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h index a3b193eea74..5e0c4bf6a0c 100644 --- a/tensorflow/compiler/jit/encapsulate_util.h +++ b/tensorflow/compiler/jit/encapsulate_util.h @@ -117,6 +117,25 @@ Status PreprocessForEncapsulation(Graph* g, // Information for XLA computation. struct XlaClusterInfo { + // Add an explicitly-defined default constructor for this class. + // + // The compiler may delete the default constructor here because + // host_compute_core is a const member whose type (std::map) doesn't + // necessarily have a user provided constructor -- while libc++ and + // libstdc++ 4.8 provide a user defined default constructor, libstdc++ at + // least >= 7.3 does not. See also c++11 [class.ctor] p5. + // + // TODO(klimek): In c++17 we'll be able to initialize host_compute_core + // without losing aggregate initialization, which allows us to get rid of + // the constructor definitions again. + XlaClusterInfo() {} + XlaClusterInfo(const string& cluster_name, + const NameAttrList& func_name_attrs, Node* node, + const std::map& host_compute_core) + : cluster_name(cluster_name), + func_name_attrs(func_name_attrs), + node(node), + host_compute_core(host_compute_core) {} // XLA cluster name. It might be different from `func_name`. const string cluster_name; // Name and attributes of XLA computation function. 
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 70b019d35fc..8b3587c5087 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -394,12 +394,12 @@ Status ConstructHostGraph( for (const string& host_func : outside_compilation_host_graphs) { VLOG(4) << "Expanding host graph " << host_func; FunctionBody* host_fbody = nullptr; - TF_RETURN_IF_ERROR( - FunctionDefToBodyHelper(*fld->Find(host_func), AttrSlice(), fld, - [&](const string& op, const OpDef** sig) { - return fld->LookUpOpDef(op, sig); - }, - &host_fbody)); + TF_RETURN_IF_ERROR(FunctionDefToBodyHelper( + *fld->Find(host_func), AttrSlice(), fld, + [&](const string& op, const OpDef** sig) { + return fld->LookUpOpDef(op, sig); + }, + &host_fbody)); std::unique_ptr host_fbody_deleter(host_fbody); // We use ReverseDFS() to copy nodes. Make sure all nodes are reverse @@ -411,52 +411,53 @@ Status ConstructHostGraph( node_map[host_fbody->graph->source_node()] = (*host_graph)->source_node(); node_map[host_fbody->graph->sink_node()] = (*host_graph)->sink_node(); Status s; - ReverseDFS(*host_fbody->graph, /*enter=*/nullptr, - [&](const Node* n) { - if (!s.ok()) { - return; - } + ReverseDFS( + *host_fbody->graph, /*enter=*/nullptr, + [&](const Node* n) { + if (!s.ok()) { + return; + } - Node* copy; - if (node_map.find(n) != node_map.end()) { - // Already copied this node. - copy = node_map.at(n); - } else if (IsKeyPlaceholderNode(*n)) { - // Change a). - copy = key_placeholder; - node_map[n] = copy; - } else { - // Copy the node. - NodeDef copy_def = n->def(); - // Change c). - copy_def.clear_device(); - copy = (*host_graph)->AddNode(copy_def, &s); - if (!s.ok()) { - return; - } - node_map[n] = copy; - } + Node* copy; + if (node_map.find(n) != node_map.end()) { + // Already copied this node. + copy = node_map.at(n); + } else if (IsKeyPlaceholderNode(*n)) { + // Change a). + copy = key_placeholder; + node_map[n] = copy; + } else { + // Copy the node. + NodeDef copy_def = n->def(); + // Change c). + copy_def.clear_device(); + copy = (*host_graph)->AddNode(copy_def, &s); + if (!s.ok()) { + return; + } + node_map[n] = copy; + } - // Only handle input edges. Output edges will be added later as - // its output nodes' input edges. - for (auto e : n->in_edges()) { - if (node_map.find(e->src()) == node_map.end()) { - s = errors::Internal("Cannot find node image for ", - e->src()->DebugString()); - return; - } - (*host_graph) - ->AddEdge(node_map[e->src()], e->src_output(), copy, - e->dst_input()); - } + // Only handle input edges. Output edges will be added later as + // its output nodes' input edges. + for (auto e : n->in_edges()) { + if (node_map.find(e->src()) == node_map.end()) { + s = errors::Internal("Cannot find node image for ", + e->src()->DebugString()); + return; + } + (*host_graph) + ->AddEdge(node_map[e->src()], e->src_output(), copy, + e->dst_input()); + } - // Change b). - if (copy->type_string() == "_XlaRecvAtHost" || - copy->type_string() == "_XlaSendFromHost") { - (*host_graph)->AddControlEdge(copy, sequencer); - } - }, - NodeComparatorID()); + // Change b). 
+ if (copy->type_string() == "_XlaRecvAtHost" || + copy->type_string() == "_XlaSendFromHost") { + (*host_graph)->AddControlEdge(copy, sequencer); + } + }, + NodeComparatorID()); if (!s.ok()) { return s; } @@ -838,7 +839,12 @@ Status ExtractOutsideCompilationForFunction( FunctionDef shape_inference_fdef = *xla_fdef; shape_inference_fdef.mutable_signature()->set_name( shape_inference_graph); - TF_RETURN_IF_ERROR(fld->AddFunctionDef(shape_inference_fdef)); + if (fld->Find(shape_inference_graph)) { + TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph, + shape_inference_fdef)); + } else { + TF_RETURN_IF_ERROR(fld->AddFunctionDef(shape_inference_fdef)); + } } } } diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc index bd8719b7f1a..d984ca15cb7 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_replace.h" +#include "absl/types/optional.h" #include "tensorflow/cc/framework/scope_internal.h" #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/const_op.h" @@ -34,14 +35,30 @@ limitations under the License. namespace tensorflow { namespace { -Status GetTensorFromConstOp(Node* n, Tensor* out_tensor) { - TF_RET_CHECK(n->type_string() == "Const"); + +// StatusOrOptional instances hold +// +// - A non-OK Status to indicate an error that needs to be propagated out of +// this pass (e.g. the Graph is malformed). +// +// - A nullopt to indicate the function that created the instance failed to do +// what it set out to do but this is not actually an error +// (e.g. TryToGetTensorFromConstOp was passed a non-Const node). +// +// - A T to indicate a successful operation. +template +using StatusOrOptional = xla::StatusOr>; + +StatusOrOptional TryToGetTensorFromConstOp(Node* n) { + if (n->type_string() != "Const") { + return {absl::nullopt}; + } + const TensorProto* proto = nullptr; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "value", &proto)); Tensor tensor(proto->dtype()); TF_RET_CHECK(tensor.FromProto(*proto)); - *out_tensor = std::move(tensor); - return Status::OK(); + return {tensor}; } struct SliceInputs { @@ -70,7 +87,7 @@ std::vector IntTensorAsVector(const Tensor& t) { // Packages up the inputs to a Slice operation into an instance of // `SliceInputs`. 
-Status GetSliceInputs(Node* slice, SliceInputs* slice_inputs) { +StatusOrOptional GetSliceInputs(Node* slice) { const int kSliceInputIndex = 0; const int kSliceBeginIndex = 1; const int kSliceSizeIndex = 2; @@ -81,23 +98,27 @@ Status GetSliceInputs(Node* slice, SliceInputs* slice_inputs) { TF_RETURN_IF_ERROR(slice->input_edge(kSliceSizeIndex, &slice_size_edge)); const Edge* slice_begin_edge; TF_RETURN_IF_ERROR(slice->input_edge(kSliceBeginIndex, &slice_begin_edge)); - slice_inputs->input = + + SliceInputs slice_inputs; + slice_inputs.input = Output(slice_input_edge->src(), slice_input_edge->src_output()); - slice_inputs->begin = + slice_inputs.begin = Output(slice_begin_edge->src(), slice_begin_edge->src_output()); - slice_inputs->size = + slice_inputs.size = Output(slice_size_edge->src(), slice_size_edge->src_output()); - Tensor tf_slice_size; - TF_RETURN_IF_ERROR( - GetTensorFromConstOp(slice_inputs->size.node(), &tf_slice_size)); - - if (tf_slice_size.dims() != 1) { - return errors::Internal("Expected vector for the slice size input."); + TF_ASSIGN_OR_RETURN(absl::optional tf_slice_size, + TryToGetTensorFromConstOp(slice_inputs.size.node())); + if (!tf_slice_size.has_value()) { + return {absl::nullopt}; } - slice_inputs->size_as_vector = IntTensorAsVector(tf_slice_size); - return Status::OK(); + if (tf_slice_size->dims() != 1) { + return {absl::nullopt}; + } + + slice_inputs.size_as_vector = IntTensorAsVector(*tf_slice_size); + return {slice_inputs}; } // Casts `x` to a DT_INT64 if it isn't one already. @@ -263,36 +284,43 @@ Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs, return Status::OK(); } -// Returns true if `n` is a slice we can rewrite to have a static shape -// (i.e. have the output shape only depend on the "size" input). Fills in -// `slice_inputs` in the process. -bool IsRewritableSlice(Node* n, SliceInputs* slice_inputs) { +// If `n` is a slice we can rewrite to have a static shape (i.e. have the output +// shape only depend on the "size" input) then returns the a SliceInputs +// representing the inputs to `n`. Otherwise returns nullopt. +StatusOrOptional IsRewritableSlice(Node* n) { if (n->type_string() != "Slice") { - return false; + return {absl::nullopt}; } if (!GetXlaClusterForNode(*n).has_value()) { // There is no need to change slice ops outside XLA clusters. - return false; + return {absl::nullopt}; } - if (!GetSliceInputs(n, slice_inputs).ok()) { - // Could not parse slice inputs. E.g. the sizes input was not a constant. - return false; + TF_ASSIGN_OR_RETURN(absl::optional slice_inputs, + GetSliceInputs(n)); + if (!slice_inputs.has_value()) { + return {absl::nullopt}; } // If slice_size[i] < -1 for any i then executing the slice will throw an // error, and we don't do anything here. 
- return absl::c_all_of(slice_inputs->size_as_vector, - [](int64 size_i) { return size_i >= -1; }); + bool slice_is_ok = absl::c_all_of(slice_inputs->size_as_vector, + [](int64 size_i) { return size_i >= -1; }); + if (!slice_is_ok) { + return {absl::nullopt}; + } + + return slice_inputs; } Status FindAndRewriteSlices(Graph* g, bool* changed) { std::vector> slices_to_rewrite; for (Node* n : g->nodes()) { - SliceInputs slice_inputs; - if (IsRewritableSlice(n, &slice_inputs)) { - slices_to_rewrite.push_back({n, std::move(slice_inputs)}); + TF_ASSIGN_OR_RETURN(absl::optional slice_inputs, + IsRewritableSlice(n)); + if (slice_inputs.has_value()) { + slices_to_rewrite.push_back({n, std::move(*slice_inputs)}); } } diff --git a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc index 107d521077c..f79bdc1e2e8 100644 --- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc +++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc @@ -44,11 +44,8 @@ REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 26, REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 10, MarkForCompilationPass); -// TODO(b/111210515): IncreaseDynamismForAutoJitPass creates slices with index -// type DT_INT64 which do not have a kernel on GPU. -// -// REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20, -// IncreaseDynamismForAutoJitPass); +REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20, + IncreaseDynamismForAutoJitPass); REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30, PartiallyDeclusterPass); diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 6bcae1dcc3d..055de7afcc5 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -39,12 +39,22 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/util/stream_executor_util.h" +// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that +// in error case, it returns RET instead of void. +#define OP_REQUIRES_OK_RETURN(CTX, RET, ...) 
\ + do { \ + ::tensorflow::Status _s(__VA_ARGS__); \ + if (!TF_PREDICT_TRUE(_s.ok())) { \ + (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ + return RET; \ + } \ + } while (0) + namespace tensorflow { namespace { -Status PlatformInfoFromContext(OpKernelConstruction* ctx, - XlaPlatformInfo* result) { +XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { DeviceType device_type = ctx->device_type(); se::Platform::Id platform_id = nullptr; const XlaDevice::Metadata* xla_device_metadata = nullptr; @@ -76,16 +86,16 @@ Status PlatformInfoFromContext(OpKernelConstruction* ctx, } if (!device_allocator) { - TF_ASSIGN_OR_RETURN(se::Platform* const platform, - se::MultiPlatformManager::PlatformWithId(platform_id)); + xla::StatusOr maybe_platform = + se::MultiPlatformManager::PlatformWithId(platform_id); + OP_REQUIRES_OK_RETURN(ctx, XlaPlatformInfo(), maybe_platform.status()); + xla_allocator = absl::make_unique( - platform, ctx->device()->GetAllocator({})); + maybe_platform.ValueOrDie(), ctx->device()->GetAllocator({})); } - *result = XlaPlatformInfo(device_type, platform_id, xla_device_metadata, - std::move(xla_allocator), device_allocator); - - return Status::OK(); + return XlaPlatformInfo(device_type, platform_id, xla_device_metadata, + std::move(xla_allocator), device_allocator); } // A closure describing how to run a compiled version of a TensorFlow function. @@ -179,9 +189,8 @@ XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, : OpKernel(ctx), constants_(constants), resources_(resources), - function_(function) { - OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_)); -} + function_(function), + platform_info_(PlatformInfoFromContext(ctx)) {} static Status BuildCompilationCache(OpKernelContext* ctx, const XlaPlatformInfo& platform_info, @@ -277,8 +286,10 @@ static Status CompileToLocalExecutable( // rather than a one-element tuple. compile_options.always_return_tuple = false; - return cache->Compile(options, function, constant_args, *variables, ctx, - compile_options, + std::vector args; + TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_args, *variables, ctx, &args)); + return cache->Compile(options, function, args, compile_options, lazy ? XlaCompilationCache::CompileMode::kLazy : XlaCompilationCache::CompileMode::kStrict, kernel, executable); @@ -333,18 +344,6 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { } namespace { - -// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that -// in error case, it returns RET instead of void. -#define OP_REQUIRES_OK_RETURN(CTX, RET, ...) \ - do { \ - ::tensorflow::Status _s(__VA_ARGS__); \ - if (!TF_PREDICT_TRUE(_s.ok())) { \ - (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ - return RET; \ - } \ - } while (0) - // Helper static functions to construct parameters for // XlaLocalLaunchBase constructor from OpKernelConstruction. 
std::vector ConstantsVector(OpKernelConstruction* ctx) { @@ -381,7 +380,12 @@ NameAttrList FunctionAttr(OpKernelConstruction* ctx) { return *func; } -#undef OP_REQUIRES_OK_RETURN +bool MustCompileAttr(OpKernelConstruction* ctx) { + bool must_compile; + OP_REQUIRES_OK_RETURN(ctx, false, + ctx->GetAttr("must_compile", &must_compile)); + return must_compile; +} } // namespace XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) @@ -396,10 +400,9 @@ XlaCompileOp::XlaCompileOp(OpKernelConstruction* ctx) : OpKernel(ctx), constants_(ConstantsVector(ctx)), resources_(ResourcesVector(ctx)), - function_(FunctionAttr(ctx)) { - OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("must_compile", &must_compile_)); -} + function_(FunctionAttr(ctx)), + platform_info_(PlatformInfoFromContext(ctx)), + must_compile_(MustCompileAttr(ctx)) {} void XlaCompileOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaCompileOp " << def().name() @@ -409,13 +412,30 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { xla::LocalExecutable* executable; std::map variables; - if (legacy_flags::GetXlaOpsCommonFlags().tf_xla_always_defer_compilation) { + bool cannot_compile_cluster; + { + mutex_lock guard(cannot_compile_cluster_mu_); + cannot_compile_cluster = cannot_compile_cluster_; + } + + if (legacy_flags::GetXlaOpsCommonFlags().tf_xla_always_defer_compilation || + cannot_compile_cluster) { executable = nullptr; } else { - OP_REQUIRES_OK(ctx, CompileToLocalExecutable( - ctx, function_, platform_info_, resources_, - constants_, /*lazy=*/!must_compile_, &client, - &variables, &kernel, &executable)); + Status status = CompileToLocalExecutable( + ctx, function_, platform_info_, resources_, constants_, + /*lazy=*/!must_compile_, &client, &variables, &kernel, &executable); + if (must_compile_ || status.code() != error::UNIMPLEMENTED) { + OP_REQUIRES_OK(ctx, status); + } + + if (status.code() == error::UNIMPLEMENTED) { + LOG(WARNING) << "Compilation failed:" << status.ToString() + << ". Falling back to TF function call."; + executable = nullptr; + mutex_lock guard(cannot_compile_cluster_mu_); + cannot_compile_cluster_ = true; + } } AllocatorAttributes host_alloc_attrs; @@ -452,9 +472,8 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { ctx->set_output(1, compilation_successful); } -XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_)); -} +XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) + : OpKernel(ctx), platform_info_(PlatformInfoFromContext(ctx)) {} void XlaRunOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaRunOp " << def().name(); diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h index ac90837e0d9..7b4d4b5b473 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.h +++ b/tensorflow/compiler/jit/kernels/xla_ops.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_ #define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_ +#include + #include "tensorflow/compiler/jit/xla_compilation_cache.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" @@ -33,6 +35,7 @@ namespace tensorflow { class XlaPlatformInfo { public: XlaPlatformInfo() : device_type_("") {} + XlaPlatformInfo(XlaPlatformInfo&&) = default; explicit XlaPlatformInfo(const DeviceType device_type, se::Platform::Id platform_id, const XlaDevice::Metadata* xla_device_metadata, @@ -110,12 +113,12 @@ class XlaLocalLaunchBase : public OpKernel { protected: // Indexes of compile-time constant inputs - std::vector constants_; + const std::vector constants_; // Indexes of resource inputs - std::vector resources_; + const std::vector resources_; - NameAttrList function_; - XlaPlatformInfo platform_info_; + const NameAttrList function_; + const XlaPlatformInfo platform_info_; }; // XlaLocalLaunchOp is used to replace a region of the TensorFlow graph @@ -144,15 +147,23 @@ class XlaCompileOp : public OpKernel { private: // Indexes of compile-time constant inputs - std::vector constants_; + const std::vector constants_; // Indexes of resource inputs - std::vector resources_; + const std::vector resources_; - NameAttrList function_; + const NameAttrList function_; XlaPlatformInfo platform_info_; - bool must_compile_; + const bool must_compile_; + + // cannot_compile_cluster_ is set to true if XLA returns an Unimplemented + // error when compiling the cluster this _XlaCompile is supposed to compile. + // If `cannot_compile_cluster_` is true then we avoid compiling this cluster + // on any future calls to _XlaCompile. + bool cannot_compile_cluster_ GUARDED_BY(cannot_compile_cluster_mu_) = false; + + mutex cannot_compile_cluster_mu_; }; class XlaRunOp : public OpKernel { @@ -162,7 +173,7 @@ class XlaRunOp : public OpKernel { void Compute(OpKernelContext* ctx) override; private: - XlaPlatformInfo platform_info_; + const XlaPlatformInfo platform_info_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD index 49ff9a3ddd1..5fa6c85f06f 100644 --- a/tensorflow/compiler/jit/legacy_flags/BUILD +++ b/tensorflow/compiler/jit/legacy_flags/BUILD @@ -22,7 +22,7 @@ cc_library( hdrs = ["mark_for_compilation_pass_flags.h"], deps = [ - "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env", + "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", ], @@ -34,7 +34,7 @@ cc_library( hdrs = ["xla_device_flags.h"], deps = [ - "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env", + "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", ], @@ -46,7 +46,7 @@ cc_library( hdrs = ["build_xla_ops_pass_flags.h"], deps = [ - "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env", + "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", ], @@ -58,7 +58,7 @@ cc_library( hdrs = ["xla_ops_common_flags.h"], deps = [ - "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env", + "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", ], diff --git a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc index 73f4dc73ed8..961c17c17ea 100644 
--- a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc +++ b/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc @@ -16,7 +16,7 @@ limitations under the License. #include // NOLINT #include "tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h" -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" +#include "tensorflow/compiler/xla/parse_flags_from_env.h" #include "tensorflow/core/util/command_line_flags.h" namespace tensorflow { @@ -34,7 +34,7 @@ void AllocateAndParseFlags() { Flag("tf_xla_enable_lazy_compilation", &flags->tf_xla_enable_lazy_compilation, ""), }); - xla::legacy_flags::ParseFlagsFromEnv(*flag_list); + xla::ParseFlagsFromEnv(*flag_list); } } // namespace diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc index 7277a1d1f8a..bad306e0b0a 100644 --- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc +++ b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc @@ -19,7 +19,8 @@ limitations under the License. #include #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" +#include "tensorflow/compiler/xla/parse_flags_from_env.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/command_line_flags.h" @@ -64,7 +65,18 @@ static void AllocateFlags() { Flag("tf_xla_fusion_only", &flags->tf_xla_fusion_only, "enable fusion of element-wise operations only using XLA when " "global_jit_level is ON*.")}); - xla::legacy_flags::ParseFlagsFromEnv(*flag_list); + xla::ParseFlagsFromEnv(*flag_list); + + if (VLOG_IS_ON(1)) { + VLOG(1) << "Parsed MarkForCompilationPassFlags:"; + VLOG(1) << " tf_xla_auto_jit = " << flags->tf_xla_auto_jit; + VLOG(1) << " tf_xla_min_cluster_size = " << flags->tf_xla_min_cluster_size; + VLOG(1) << " tf_xla_max_cluster_size = " << flags->tf_xla_max_cluster_size; + VLOG(1) << " tf_xla_clustering_debug = " << flags->tf_xla_clustering_debug; + VLOG(1) << " tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit; + VLOG(1) << " tf_xla_clustering_fuel = " << flags->tf_xla_clustering_fuel; + VLOG(1) << " tf_xla_fusion_only = " << flags->tf_xla_fusion_only; + } } // Append to *append_to flag definitions associated with the XLA bridge's diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h index 2affda6ab4e..79b47357a17 100644 --- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h +++ b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h @@ -33,7 +33,7 @@ void AppendMarkForCompilationPassFlags( // The values of flags associated with the XLA bridge's // mark_for_compilation_pass module. -typedef struct { +struct MarkForCompilationPassFlags { int32 tf_xla_auto_jit; // Control compilation of operators into XLA // computations on CPU and GPU devices. 0 = use // ConfigProto setting; -1 = off; 1 = on for things @@ -55,7 +55,7 @@ typedef struct { // is set to ON* and overrides its behavior. If // true, enable fusion of element-wise operations // only using XLA. -} MarkForCompilationPassFlags; +}; // Return a pointer to the MarkForCompilationPassFlags struct; // repeated calls return the same pointer. 
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc index 1bb2fce2dba..76b80d3034c 100644 --- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc +++ b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc @@ -19,7 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h" -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" +#include "tensorflow/compiler/xla/parse_flags_from_env.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/command_line_flags.h" @@ -41,7 +41,7 @@ static void AllocateFlags() { "Switch a device into 'on-demand' mode, where instead of " "autoclustering ops are compiled one by one just-in-time."), }); - xla::legacy_flags::ParseFlagsFromEnv(*flag_list); + xla::ParseFlagsFromEnv(*flag_list); } // Return a pointer to the XlaDeviceFlags struct; diff --git a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc index ae17fdffb9b..1443d48a734 100644 --- a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc +++ b/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc @@ -17,8 +17,8 @@ limitations under the License. #include #include "tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h" -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" - +#include "tensorflow/compiler/xla/parse_flags_from_env.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/command_line_flags.h" namespace tensorflow { @@ -35,7 +35,13 @@ void AllocateAndParseFlags() { Flag("tf_xla_always_defer_compilation", &flags->tf_xla_always_defer_compilation, ""), }); - xla::legacy_flags::ParseFlagsFromEnv(*flag_list); + xla::ParseFlagsFromEnv(*flag_list); + + if (VLOG_IS_ON(1)) { + VLOG(1) << "Parsed XlaOpsCommonFlags:"; + VLOG(1) << " tf_xla_always_defer_compilation = " + << flags->tf_xla_always_defer_compilation; + } } const XlaOpsCommonFlags& GetXlaOpsCommonFlags() { diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 11975a6bb07..70033cae0af 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -61,8 +61,23 @@ struct OperationFilter { // seeding behavior as TensorFlow's RNG (b/34749654). So we avoid // auto-clustering stateful RNG ops. bool allow_stateful_rng_ops; + + // TODO(b/118970344): Whether ControlTrigger ops are allowed. It is unsound + // to cluster ControlTrigger because of how we use deadness analysis. + bool allow_control_trigger; + + // Whether ops with dummy implementations are allowed. We avoid + // auto-clustering these ops so that the user is not surprised when XLA is + // implicitly enabled. If the user explicitly specifies to use XLA, it is fine + // to resort to a dummy implementation. Currently Assert and CheckNumerics ops + // have dummy XLA implementations. 
+ bool allow_dummy_ops; }; +bool IsDummyImplOp(absl::string_view op_name) { + return op_name == "Assert" || op_name == "CheckNumerics"; +} + bool IsStatefulRandomOp(absl::string_view op_name) { return op_name == "RandomUniform" || op_name == "RandomShuffle" || op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" || @@ -225,6 +240,12 @@ bool IsCompilableCall(const NodeDef& call_def, IsStatefulRandomOp(node->type_string())) { return false; } + if (!op_filter.allow_control_trigger && node->IsControlTrigger()) { + return false; + } + if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) { + return false; + } if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, depth + 1, lib_runtime)) { @@ -452,7 +473,14 @@ Status FindCompilationCandidates( OperationFilter op_filter; op_filter.allow_resource_ops = registration->compile_resource_ops; - op_filter.allow_stateful_rng_ops = registration->requires_compilation; + op_filter.allow_stateful_rng_ops = + (registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways); + op_filter.allow_control_trigger = + (registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways); + op_filter.allow_dummy_ops = (registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways); if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, 0, @@ -467,6 +495,15 @@ Status FindCompilationCandidates( VLOG(2) << "Rejecting " << node->name() << ": stateful random operation"; continue; } + if (!op_filter.allow_control_trigger && node->IsControlTrigger()) { + VLOG(2) << "Rejecting " << node->name() << ": is a control trigger op"; + continue; + } + if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) { + VLOG(2) << "Rejecting " << node->name() << ": dummy op (" + << node->type_string() << ")"; + continue; + } if (!op_filter.allow_resource_ops && (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) { @@ -597,11 +634,14 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { ®istration)); DeviceType jit_device_type(registration->compilation_device_name); - // We can always *compile* resource operations and stateful RNGs, even if we - // are sometimes unable to auto-cluster them. + // We can always *compile* resource operations, stateful RNGs and dummy ops, + // even if we are sometimes unable to auto-cluster them. OperationFilter op_filter; op_filter.allow_resource_ops = true; op_filter.allow_stateful_rng_ops = true; + op_filter.allow_control_trigger = true; + op_filter.allow_dummy_ops = true; + return IsCompilableCall(ndef, jit_device_type, op_filter, 0, flr); } @@ -613,10 +653,8 @@ Status MarkForCompilationPass::Run( GetGlobalJitLevel(options); legacy_flags::MarkForCompilationPassFlags* flags = legacy_flags::GetMarkForCompilationPassFlags(); - bool cpu_global_jit = flags->tf_xla_cpu_global_jit; bool fusion_only = flags->tf_xla_fusion_only; - VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit; VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only; VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit; const FunctionLibraryDefinition* fld = options.flib_def; @@ -635,9 +673,6 @@ Status MarkForCompilationPass::Run( return false; } - // If this device requires a JIT, we must say yes. - if (registration->requires_compilation) return true; - // If there is a _XlaCompile annotation, use its value. 
bool compile = false; Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile); @@ -674,18 +709,21 @@ Status MarkForCompilationPass::Run( return false; } - // Otherwise use the value of global_jit_level. - // Ignore enable_jit_by_default if global jit compilation for CPU - // is explicitly requested via tf_xla_cpu_global_jit flag - bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU; + // Otherwise use the value of global_jit_level and the device's + // autoclustering policy. bool should_compile = - (ignore_registration || registration->enable_jit_by_default) && - global_jit_level != OptimizerOptions::OFF; + registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways || + (registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally && + global_jit_level != OptimizerOptions::OFF); if (!should_compile) { if (global_jit_level == OptimizerOptions::OFF) { VLOG(2) << "Rejecting " << node->name() << ": global jit disabled."; } else { - VLOG(2) << "Rejecting " << node->name() << ": JIT for device disabled."; + VLOG(2) + << "Rejecting " << node->name() + << ": autoclustering for device only when requested explicitly."; } } return should_compile; @@ -1073,12 +1111,10 @@ Status MarkForCompilationPass::RunImpl( XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration); // Compile if this is a cluster of >= min_cluster_size compilable operators. - // Also, always compile if the operator is placed on a device that requires - // compilation, or if it contains at least one op that is marked for + // Also, always compile if it contains at least one op that is marked for // compilation that is not an Identity op. if (effective_cluster_sizes[cluster] >= min_cluster_size || - (effective_cluster_sizes[cluster] > 0 && marked_for_compilation) || - registration->requires_compilation) { + (effective_cluster_sizes[cluster] > 0 && marked_for_compilation)) { string& name = cluster_names[cluster]; if (name.empty()) { diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index ead1cf4fd5f..24d78c07726 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -817,14 +817,10 @@ TEST(XlaCompilationTest, ClusterControlTrigger) { std::unordered_map clusters = GetClusters(*graph); - ASSERT_FALSE(clusters.empty()); - string cluster_name = clusters.begin()->second; - - // ctrl_trigger_a has inputs with mismatching deadness so it won't be - // clustered. ctrl_trigger_b is okay to cluster. - std::unordered_map expected_clusters( - {{"const_a", cluster_name}, {"ctrl_trigger_b", cluster_name}}); - EXPECT_EQ(clusters, expected_clusters); + // TODO(b/118970344): ctrl_trigger_a has inputs with mismatching deadness so + // it won't be clustered. ctrl_trigger_b is okay to cluster but we don't + // cluster it because of b/118970344. 
+ EXPECT_TRUE(clusters.empty()); } TEST(XlaCompilationTest, RandomShape) { @@ -923,9 +919,8 @@ TEST(XlaCompilationTest, RandomShapeOnXlaDevice) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); std::unordered_map clusters = GetClusters(*graph); - EXPECT_NE(clusters["test/shape_rng"], ""); - EXPECT_NE(clusters["test/reshape"], ""); - EXPECT_NE(clusters["test/shape_rng"], clusters["test/reshape"]); + EXPECT_EQ(clusters["test/shape_rng"], ""); + EXPECT_EQ(clusters["test/reshape"], ""); } TEST(XlaCompilationTest, TensorArrayShapeOnXlaDevice) { @@ -1088,7 +1083,7 @@ TEST(XlaCompilationTest, ClusterStatefulRandomOpOnXlaDevice) { EXPECT_NE(clusters["test/c"], ""); } -TEST(XlaCompilationTest, DontAutoclusterStatefulRandomOp) { +TEST(XlaCompilationTest, DontAutoClusterStatefulRandomOp) { Scope root = Scope::NewRootScope().ExitOnError(); Output shape = ops::Const(root.WithOpName("test/shape_shape"), {200, 200}); Output a = ops::RandomUniform(root.WithOpName("test/a"), shape, DT_FLOAT); @@ -1104,5 +1099,53 @@ TEST(XlaCompilationTest, DontAutoclusterStatefulRandomOp) { EXPECT_EQ(clusters["test/a"], ""); EXPECT_EQ(clusters["test/b"], ""); } + +TEST(XlaCompilationTest, ClusterDummyOpsOnXlaDevice) { + absl::string_view xla_cpu_device = + "/job:worker/replica:0/task:0/device:XLA_CPU:0"; + + Scope root = Scope::NewRootScope().ExitOnError(); + Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT); + Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT); + Output check = + ops::CheckNumerics(root.WithOpName("test/check"), a, "test/check"); + Output ge = ops::GreaterEqual(root.WithOpName("test/greaterequal"), check, b); + Operation assert = ops::Assert(root.WithOpName("test/assert"), ge, {a, b}); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + for (Node* n : graph->nodes()) { + if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { + n->set_assigned_device_name(string(xla_cpu_device)); + } + } + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_NE(clusters["test/check"], ""); + EXPECT_NE(clusters["test/greaterequal"], ""); + EXPECT_NE(clusters["test/assert"], ""); +} + +TEST(XlaCompilationTest, DontAutoClusterDummyOps) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT); + Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT); + Output check = + ops::CheckNumerics(root.WithOpName("test/check"), a, "test/check"); + Output ge = ops::GreaterEqual(root.WithOpName("test/greaterequal"), check, b); + Operation assert = ops::Assert(root.WithOpName("test/assert"), ge, {a, b}); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_EQ(clusters["test/assert"], ""); + EXPECT_EQ(clusters["test/check"], ""); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc index 5b961032233..36b345ecbff 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass.cc @@ -133,6 +133,10 @@ Status PartiallyDeclusterNode(Graph* graph, Node* n) { graph->RemoveEdge(out_edge_to_clone); } + if 
(n->out_edges().empty()) { + graph->RemoveNode(n); + } + return Status::OK(); } @@ -191,6 +195,10 @@ Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) { } } + // Recompute post order since PartiallyDeclusterNode may have deleted nodes. + post_order.clear(); + GetPostOrder(*graph, &post_order, /*stable_comparator=*/NodeComparatorName(), + /*edge_filter=*/NotBackedge); nodes_to_partially_decluster.clear(); TF_RETURN_IF_ERROR( FindNodesToDecluster(*graph, &nodes_to_partially_decluster, post_order)); @@ -210,7 +218,8 @@ bool IsIntraClusterEdge(const Edge& edge) { bool IsMustCompileDevice(const DeviceType& device_type) { const XlaOpRegistry::DeviceRegistration* registration; if (XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration)) { - return registration->requires_compilation; + return registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways; } return false; diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc index 74d5ef57184..1fc5da5071f 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc @@ -437,5 +437,32 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterNonTensorFlowOps) { EXPECT_EQ(GetXlaClusterForNode(*n), "cluster_0"); } +TEST(PartiallyDeclusterPassTest, EliminatedUnusedNodes) { + const char* const kClusteredProducer0Name = "ClusteredProducer0"; + const char* const kClusteredProducer1Name = "ClusteredProducer1"; + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* input = + ops::SourceOp("FakeNullary", builder.opts().WithName("Input")); + Node* clustered_producer_0 = + ops::BinaryOp("FakeBinary", input, input, + builder.opts().WithName(kClusteredProducer0Name)); + Node* clustered_producer_1 = + ops::BinaryOp("FakeBinary", clustered_producer_0, input, + builder.opts().WithName(kClusteredProducer1Name)); + ops::BinaryOp("FakeBinary", clustered_producer_1, input, + builder.opts().WithName("UnclusteredConsumer")); + clustered_producer_0->AddAttr(kXlaClusterAttr, "cluster_0"); + clustered_producer_1->AddAttr(kXlaClusterAttr, "cluster_0"); + TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + TF_ASSERT_OK(PartiallyDecluster(&graph)); + EXPECT_EQ(FindNodeByName(*graph, kClusteredProducer0Name), nullptr); + EXPECT_EQ(FindNodeByName(*graph, kClusteredProducer1Name), nullptr); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/producer_consumer_queue.h b/tensorflow/compiler/jit/producer_consumer_queue.h deleted file mode 100644 index 7c8c04152d2..00000000000 --- a/tensorflow/compiler/jit/producer_consumer_queue.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
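
The two partially_decluster_pass changes above are related: PartiallyDeclusterNode may now delete a node outright once all of its out-edges have been cloned away, so the cached post-order must be rebuilt before the second FindNodesToDecluster pass runs over the graph. A toy illustration of recomputing a post-order over a graph with deleted ("dead") nodes, using simplified stand-in types rather than the real Graph/Node API:

#include <vector>

// Toy graph: node i's consumers are out_edges[i]; dead[i] marks removed nodes.
struct ToyGraph {
  std::vector<std::vector<int>> out_edges;
  std::vector<bool> dead;
};

// DFS post-order over live nodes, analogous in spirit to GetPostOrder.
void PostOrderFrom(const ToyGraph& g, int n, std::vector<bool>* seen,
                   std::vector<int>* order) {
  (*seen)[n] = true;
  for (int c : g.out_edges[n]) {
    if (!g.dead[c] && !(*seen)[c]) PostOrderFrom(g, c, seen, order);
  }
  order->push_back(n);
}

std::vector<int> PostOrder(const ToyGraph& g) {
  std::vector<bool> seen(g.out_edges.size(), false);
  std::vector<int> order;
  for (int n = 0; n < static_cast<int>(g.out_edges.size()); ++n) {
    if (!g.dead[n] && !seen[n]) PostOrderFrom(g, n, &seen, &order);
  }
  // Any order computed before nodes were marked dead is stale and must not be
  // reused, which is why the pass above recomputes post_order after declustering.
  return order;
}
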
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_ -#define TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_ - -#include -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/mutex.h" - -namespace tensorflow { - -// A thread-safe, first-in-first-out queue. -template -class ProducerConsumerQueue { - public: - ProducerConsumerQueue() - : capacity_(std::numeric_limits::max()) {} - ~ProducerConsumerQueue() = default; - - // Wait until the queue is non-full, then append a copy of v. - void Put(const T &v); - - // Wait until the queue is non-empty, then remove and return the head value. - T Get(); - - // If the queue is non-empty, remove the head value, placing it in *pv, and - // return true; otherwise return false. - bool TryGet(T *pv); - - // Set the capacity of the queue; the queue is full whenever count() >= - // capacity(). The initial value is the maximum size_t. Requires size > 0. - void set_capacity(std::size_t size); - - // Return the capacity of the queue. - std::size_t capacity() const; - - // Return the number of elements in the queue. - std::size_t count() const; - - // Implementation details follow. Clients should ignore. - private: - mutable tensorflow::mutex mu_; // protects all fields below - tensorflow::condition_variable non_empty_ GUARDED_BY(mu_); - tensorflow::condition_variable non_full_ GUARDED_BY(mu_); - std::size_t capacity_ GUARDED_BY(mu_); - std::deque queue_ GUARDED_BY(mu_); - - TF_DISALLOW_COPY_AND_ASSIGN(ProducerConsumerQueue); -}; - -// ------------------------------------------------------ -// Implementation details follow. Clients should ignore. - -// Wait until the queue is non-full, then append a copy of v. -template -void ProducerConsumerQueue::Put(const T &v) { - mutex_lock lock(mu_); - while (queue_.size() >= capacity_) { - non_full_.wait(lock); - } - queue_.push_back(v); - non_empty_.notify_one(); -} - -// Wait until the queue is non-empty, then remove and return the head value. -template -T ProducerConsumerQueue::Get() { - mutex_lock lock(mu_); - while (queue_.empty()) { - non_empty_.wait(lock); - } - non_full_.notify_one(); - T result_value = queue_.front(); - queue_.pop_front(); - return result_value; -} - -// If the queue is non-empty, remove the head value, placing it in *pv, and -// return true; otherwise return false. -template -bool ProducerConsumerQueue::TryGet(T *pv) { - mutex_lock lock(mu_); - bool got_element = !queue_.empty(); - if (got_element) { - non_full_.notify_one(); - *pv = queue_.front(); - queue_.pop_front(); - } - return got_element; -} - -// Set the capacity of the queue; the queue is full whenever count() >= -// capacity(). The initial value is the maximum size_t. Requires size > 0. -template -void ProducerConsumerQueue::set_capacity(std::size_t size) { - mutex_lock lock(mu_); - CHECK_NE(size, 0); - capacity_ = size; - non_full_.notify_all(); -} - -// Return the capacity of the queue. -template -std::size_t ProducerConsumerQueue::capacity() const { - mutex_lock lock(mu_); - std::size_t max_elements = capacity_; - return max_elements; -} - -// Return the number of elements in the queue. 
-template -std::size_t ProducerConsumerQueue::count() const { - mutex_lock lock(mu_); - std::size_t num_elements = queue_.size(); - return num_elements; -} -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_ diff --git a/tensorflow/compiler/jit/producer_consumer_queue_test.cc b/tensorflow/compiler/jit/producer_consumer_queue_test.cc deleted file mode 100644 index f61260c6e52..00000000000 --- a/tensorflow/compiler/jit/producer_consumer_queue_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/jit/producer_consumer_queue.h" - -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace { - -typedef ProducerConsumerQueue IntQueue; - -// Insert integers between low inclusive and high exclusive into q. -void PushRange(IntQueue *q, int low, int high) { - while (low != high) { - q->Put(low); - VLOG(2) << "Pushing " << low; - ++low; - } -} - -// Push the numbers between 0 and 999 inclusive from several threads in the -// pool. -void PushRanges(IntQueue *queue, thread::ThreadPool *pool) { - VLOG(1) << "Adding 20-36"; - pool->Schedule([queue] { PushRange(queue, 20, 36); }); - VLOG(1) << "Adding 7-20"; - pool->Schedule([queue] { PushRange(queue, 7, 20); }); - VLOG(1) << "Adding 36-501"; - pool->Schedule([queue] { PushRange(queue, 36, 501); }); - VLOG(1) << "Adding 501-1000"; - pool->Schedule([queue] { PushRange(queue, 501, 1000); }); - VLOG(1) << "Adding 0-5"; - pool->Schedule([queue] { PushRange(queue, 0, 5); }); - VLOG(1) << "Adding 5-7"; - pool->Schedule([queue] { PushRange(queue, 5, 7); }); -} - -// Pop elements from queue using Get(). Make sure that exactly elements -// were present and their values are all integers between 0 and high-1 -// inclusive. -void GetRange(IntQueue *queue, int high) { - VLOG(1) << "Testing Wait"; - std::vector results; - for (int i = 0; i != high; ++i) { - int r = queue->Get(); - VLOG(2) << "Waited and got " << r; - results.push_back(r); - } - CHECK_EQ(queue->count(), 0); - std::sort(results.begin(), results.end()); - for (int i = 0; i != high; ++i) { - CHECK(results[i] == i); - } -} - -// Pop elements from queue using TryGet(). Make sure that exactly -// elements were present and their values are all integers between 0 and high-1 -// inclusive. -void TryGetRange(IntQueue *queue, int high) { - std::vector results; - // Give up if we don't get all the elements back from the queue - // in 10 seconds. 
- int timeout = 10; - int r; - for (int i = 0; i != high; ++i) { - while (!queue->TryGet(&r)) { - if (!timeout--) { - LOG(FATAL) << "Can't find all elements in the queue"; - } - VLOG(1) << "Sleeping for a second..."; - sleep(1); - } - VLOG(2) << "Popped " << r; - results.push_back(r); - } - CHECK_EQ(queue->count(), 0); - CHECK(!queue->TryGet(&r)); - std::sort(results.begin(), results.end()); - for (int i = 0; i != high; ++i) { - CHECK_EQ(i, results[i]); - } -} - -const int kNumThreads = 15; - -TEST(ProducerConsumerQueue, GetRange) { - IntQueue queue; - { - thread::ThreadPool pool(Env::Default(), "test", kNumThreads); - PushRanges(&queue, &pool); - } - GetRange(&queue, 1000); -} - -TEST(ProducerConsumerQueue, TryGetRange) { - IntQueue queue; - { - thread::ThreadPool pool(Env::Default(), "test", kNumThreads); - PushRanges(&queue, &pool); - } - TryGetRange(&queue, 1000); -} - -TEST(ProducerConsumerQueue, ParallelGetRange) { - IntQueue queue; - { - thread::ThreadPool pool(Env::Default(), "test", kNumThreads); - pool.Schedule([&queue] { GetRange(&queue, 1000); }); - PushRanges(&queue, &pool); - } -} - -TEST(ProducerConsumerQueue, ParallelTryGetRange) { - IntQueue queue; - { - thread::ThreadPool pool(Env::Default(), "test", kNumThreads); - pool.Schedule([&queue] { TryGetRange(&queue, 1000); }); - PushRanges(&queue, &pool); - } -} - -} // namespace -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 4a5ea9e0a5f..3df5479a55e 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -65,14 +66,14 @@ string XlaCompilationCache::DebugString() { // Compute a string signature which encodes the shapes of the // arguments in the supplied list. 
-string XlaCompilationCache::SignatureDebugString(const Signature& sig) { - string result = sig.name; - for (const auto& a : sig.arg_types) { +string XlaCompilationCache::Signature::HumanString() const { + string result = name; + for (const auto& a : arg_types) { absl::StrAppend(&result, ",", DataTypeString(a.first), a.second.DebugString()); } - for (const auto& v : sig.arg_values) { + for (const auto& v : arg_values) { absl::StrAppend(&result, "; ", v.DebugString()); } return result; @@ -84,7 +85,9 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const { if (arg_values.size() != other.arg_values.size()) return false; for (int i = 0; i < arg_values.size(); ++i) { - if (arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) { + if (arg_values[i].dtype() != other.arg_values[i].dtype() || + arg_values[i].shape() != other.arg_values[i].shape() || + arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) { return false; } } @@ -108,96 +111,30 @@ uint64 XlaCompilationCache::Signature::Hash::operator()( return h; } -Status XlaCompilationCache::BuildSignature( - const NameAttrList& function, const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, - Signature* signature) { - signature->name = Canonicalize(function.name(), AttrSlice(&function.attr())); - signature->arg_values.reserve(constant_args.size()); - - signature->arg_types.reserve(ctx->num_inputs() - constant_args.size()); - - for (int i = 0; i < ctx->num_inputs(); ++i) { - if (constant_args.count(i) > 0) { - // Use the values of compile time constants in the signature. - signature->arg_values.push_back(constant_args.at(i)); - } else if (variable_args.count(i) > 0) { - const OptionalTensor& variable = variable_args.at(i); - if (variable.present) { - signature->arg_types.emplace_back(variable.value.dtype(), - variable.value.shape()); - } else { - signature->arg_types.emplace_back(DT_INVALID, TensorShape()); - } - } else { - signature->arg_types.emplace_back(ctx->input_dtype(i), - ctx->input(i).shape()); +xla::StatusOr +XlaCompilationCache::BuildSignature( + const NameAttrList& function, + absl::Span args) { + Signature signature; + signature.name = Canonicalize(function.name(), AttrSlice(&function.attr())); + for (const XlaCompiler::Argument& arg : args) { + switch (arg.kind) { + case XlaCompiler::Argument::kConstant: + signature.arg_values.push_back(arg.constant_value); + break; + case XlaCompiler::Argument::kParameter: + case XlaCompiler::Argument::kResource: + signature.arg_types.emplace_back(arg.type, arg.shape); + break; + default: + return errors::InvalidArgument( + "Unhandled argument kind in XlaCompilationCache: ", + arg.HumanString()); } } - return Status::OK(); + return std::move(signature); } -namespace { - -// Builds a XlaCompiler::Argument vector from the arguments to the XlaLaunch op. -Status BuildArguments(const std::map& constant_args, - const std::map& variable_args, - OpKernelContext* ctx, - std::vector* args) { - args->resize(ctx->num_inputs()); - - for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) { - XlaCompiler::Argument& arg = (*args)[input_num]; - if (constant_args.count(input_num) > 0) { - // Handles compile-time constants. 
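
BuildSignature now derives the cache key directly from the XlaCompiler::Argument list: constant arguments contribute their values, while parameters and resources contribute (dtype, shape) pairs. A simplified standalone sketch of that mapping, with toy types standing in for XlaCompiler::Argument, DataType and TensorShape:

#include <string>
#include <utility>
#include <vector>

enum class ArgKind { kConstant, kParameter, kResource };

struct ToyArg {
  ArgKind kind;
  int dtype = 0;
  std::vector<int> shape;
  std::string constant_value;  // only meaningful for kConstant
};

struct ToySignature {
  std::string name;
  std::vector<std::pair<int, std::vector<int>>> arg_types;  // (dtype, shape)
  std::vector<std::string> arg_values;                      // constant payloads
};

// Constants key the cache by value; parameters and resources key it by type
// and shape only (the real code also rejects unhandled argument kinds).
ToySignature BuildToySignature(const std::string& name,
                               const std::vector<ToyArg>& args) {
  ToySignature sig;
  sig.name = name;
  for (const ToyArg& arg : args) {
    if (arg.kind == ArgKind::kConstant) {
      sig.arg_values.push_back(arg.constant_value);
    } else {
      sig.arg_types.emplace_back(arg.dtype, arg.shape);
    }
  }
  return sig;
}
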
- const Tensor& input = constant_args.at(input_num); - TF_RET_CHECK(input.dtype() != DT_RESOURCE); - arg.kind = XlaCompiler::Argument::kConstant; - arg.type = input.dtype(); - arg.shape = input.shape(); - arg.constant_value = input; - } else if (variable_args.count(input_num) == 0) { - // Handles the non-constant arguments. - const Tensor& input = ctx->input(input_num); - TF_RET_CHECK(input.dtype() != DT_RESOURCE); - if (input.NumElements() > 0) { - arg.kind = XlaCompiler::Argument::kParameter; - } else { - arg.kind = XlaCompiler::Argument::kConstant; - arg.constant_value = input; - } - arg.type = input.dtype(); - arg.shape = input.shape(); - } else { - // Handles resource variables. - const Tensor& input = ctx->input(input_num); - TF_RET_CHECK(input.dtype() == DT_RESOURCE); - const OptionalTensor& variable = variable_args.at(input_num); - arg.name = variable.name; - arg.kind = XlaCompiler::Argument::kResource; - arg.resource_kind = XlaResource::kVariable; - if (variable.present) { - const Tensor& value = variable.value; - arg.type = value.dtype(); - arg.shape = value.shape(); - arg.initialized = true; - } else { - // The values of uninitialized variables are not passed as inputs, since - // they are meaningless. However, it is legal to assign to a resource - // variable for the first time inside the XLA computation, so we do - // permit uninitialized variables. - arg.initialized = false; - arg.type = DT_INVALID; - arg.shape = TensorShape(); - } - } - } - - return Status::OK(); -} - -} // namespace - Status XlaCompilationCache::BuildExecutable( const XlaCompiler::Options& options, const XlaCompiler::CompilationResult& result, @@ -227,25 +164,38 @@ Status XlaCompilationCache::BuildExecutable( Status XlaCompilationCache::Compile( const XlaCompiler::Options& options, const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, + absl::Span args, const XlaCompiler::CompileOptions& compile_options, CompileMode compile_mode, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { - // Set the compile threshold to 1 to implement CompileMode::kStrict. - int64 compile_threshold = - compile_mode == CompileMode::kLazy ? kDefaultCompilationThreshold : 1; - return CompileImpl(options, function, constant_args, variable_args, ctx, - compile_options, /*compile_single_op=*/false, + absl::optional compile_threshold; + if (compile_mode == CompileMode::kLazy) { + compile_threshold = kDefaultCompilationThreshold; + } + auto compile_fn = [&](XlaCompiler* compiler, + XlaCompiler::CompilationResult* result) { + return compiler->CompileFunction(compile_options, function, args, result); + }; + return CompileImpl(options, function, args, compile_fn, /*compile_threshold=*/compile_threshold, out_compilation_result, out_executable); } +static bool IsMegamorphic(int64 compile_count, int64 execution_count) { + const int64 kCompileThreshold = 10; + const int64 kMinExecutionsPerCompile = 50; + + // This heuristic is trying to capture the following property: have we sunk a + // certain minimum amount of compile time into the cluster that didn't quite + // "pay off"? 
+ return compile_count > kCompileThreshold && + execution_count < kMinExecutionsPerCompile * compile_count; +} + Status XlaCompilationCache::CompileSingleOp( const XlaCompiler::Options& options, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, + absl::Span args, OpKernelContext* ctx, const XlaCompiler::CompileOptions& compile_options, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { @@ -253,54 +203,41 @@ Status XlaCompilationCache::CompileSingleOp( NameAttrList name; name.set_name(def.op()); *name.mutable_attr() = def.attr(); - return CompileImpl(options, name, constant_args, variable_args, ctx, - compile_options, - /*compile_single_op=*/true, /*compile_threshold=*/1, + auto compile_op = [&](XlaCompiler* compiler, + XlaCompiler::CompilationResult* result) { + std::vector result_dtypes(ctx->num_outputs()); + for (int i = 0; i < result_dtypes.size(); ++i) { + result_dtypes[i] = ctx->expected_output_dtype(i); + } + return compiler->CompileSingleOp(compile_options, ctx->op_kernel().def(), + args, result_dtypes, result); + }; + return CompileImpl(options, name, args, compile_op, + /*compile_threshold=*/absl::nullopt, out_compilation_result, out_executable); } Status XlaCompilationCache::CompileImpl( const XlaCompiler::Options& options, const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, - const XlaCompiler::CompileOptions& compile_options, bool compile_single_op, - int64 compile_threshold, + absl::Span args, + const std::function& compile_fn, + absl::optional compile_threshold, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { DCHECK_NE(out_executable, nullptr); VLOG(2) << "XlaCompilationCache::Compile " << DebugString(); if (VLOG_IS_ON(2)) { - VLOG(2) << "num_inputs=" << ctx->num_inputs() - << " num_constant_args=" << constant_args.size() - << " num_variable_args=" << variable_args.size(); - for (int i = 0; i < ctx->num_inputs(); i++) { - TensorShape shape = ctx->input(i).shape(); - VLOG(2) << i << ": dtype=" << DataTypeString(ctx->input_dtype(i)) - << " present=" << ctx->has_input(i) - << " shape=" << shape.DebugString(); - } - for (auto& iterator : variable_args) { - const OptionalTensor& variable = iterator.second; - VLOG(2) << "variable present=" << variable.present - << " type=" << DataTypeString(variable.value.dtype()) - << " shape=" << variable.value.shape().DebugString() - << " TF arg= " << iterator.first; - } - VLOG(2) << "num_outputs = " << ctx->num_outputs(); - for (int i = 0; i < ctx->num_outputs(); i++) { - VLOG(2) << i << ": dtype=" << ctx->expected_output_dtype(i); + VLOG(2) << "num_inputs=" << args.size(); + for (int i = 0; i < args.size(); i++) { + VLOG(2) << i << ": " << args[i].HumanString(); } } - TF_RET_CHECK(constant_args.size() + variable_args.size() <= - ctx->num_inputs()); + TF_ASSIGN_OR_RETURN(Signature signature, BuildSignature(function, args)); + VLOG(2) << "Signature: " << signature.HumanString(); - Signature signature; - TF_RETURN_IF_ERROR( - BuildSignature(function, constant_args, variable_args, ctx, &signature)); - - VLOG(2) << "Signature: " << SignatureDebugString(signature); // The outer lock protects the existence of the cache entry. It does not // protect the contents of the cache entry. 
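
To make the heuristic above concrete: with kCompileThreshold = 10 and kMinExecutionsPerCompile = 50, a cluster compiled 40 times but executed only 1960 times is megamorphic, while 11 compilations amortized over 600 executions are not. A standalone restatement with the same constants, using plain int64_t in place of TF's int64:

#include <cstdint>

constexpr int64_t kCompileThreshold = 10;
constexpr int64_t kMinExecutionsPerCompile = 50;

// A cluster is "megamorphic" when it has been recompiled many times and those
// compilations have not been amortized over enough executions.
bool IsMegamorphic(int64_t compile_count, int64_t execution_count) {
  return compile_count > kCompileThreshold &&
         execution_count < kMinExecutionsPerCompile * compile_count;
}

// IsMegamorphic(/*compile_count=*/40, /*execution_count=*/1960) == true   (1960 < 50 * 40)
// IsMegamorphic(/*compile_count=*/11, /*execution_count=*/600)  == false  (600 >= 50 * 11)
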
Entry* entry; @@ -319,25 +256,67 @@ Status XlaCompilationCache::CompileImpl( // (since they get the benefit of XLA right away without waiting for warmup) // and doesn't hurt much for dynamically shaped TensorFlow graphs (we "pay" at // most one cluster-compilation's worth of compile time). - bool is_first_execution = [&] { + bool is_first_execution; + + // We avoid compiling clusters that have "gone megamorphic" i.e. have an + // excessive amount of shape dynamism. + bool is_megamorphic; + + { mutex_lock lock(cluster_compile_stats_mu_); auto it = cluster_compile_stats_.emplace(function.name(), ClusterCompileStats{}) .first; - return it->second.execution_count++ == 0; - }(); + is_first_execution = it->second.execution_count++ == 0; + + // The is_megamorphic bit is "sticky". We assume clusters that have been + // observed to be megamorphic once stay megamorphic forever. + it->second.is_megamorphic |= + IsMegamorphic(/*compile_count=*/it->second.compile_count, + /*execution_count=*/it->second.execution_count); + is_megamorphic = it->second.is_megamorphic; + } // Acquire the cache entry lock and compile, if necessary. // TODO(phawkins): this locking will need to be restructured when we implement // cache eviction. mutex_lock entry_lock(entry->mu); int64 current_request_count = ++entry->request_count; + VLOG(2) << "Compilation cache entry hit: " << entry->compiled + << " signature: " << signature.HumanString() << " with request count " + << current_request_count << " and compile threshold " + << compile_threshold.value_or(0); if (!entry->compiled) { - VLOG(2) << "Compilation cache miss for signature: " - << SignatureDebugString(signature) << " with request count " - << current_request_count << " and compile threshold " - << compile_threshold; - if (!is_first_execution && current_request_count < compile_threshold) { + const bool should_compile = [&] { + if (!compile_threshold.has_value()) { + // Lazy compilation is disabled. + return true; + } + + if (is_megamorphic) { + VLOG(3) << "Not compiling cluster " << function.name() + << " because it is megamorphic."; + return false; + } + + if (is_first_execution) { + return true; + } + + bool reached_compile_threshold = + current_request_count >= *compile_threshold; + if (!reached_compile_threshold) { + VLOG(3) + << "Not compiling cluster " << function.name() + << " because it has not reached compile threshold; threshold is " + << *compile_threshold << " execution count " + << current_request_count << "."; + } + return reached_compile_threshold; + }(); + + if (!should_compile) { + VLOG(2) << "Not compiling for signature: " << signature.HumanString(); *out_compilation_result = nullptr; *out_executable = nullptr; return Status::OK(); @@ -347,21 +326,12 @@ Status XlaCompilationCache::CompileImpl( const uint64 compile_start_us = env->NowMicros(); // Do the actual JIT compilation without holding the lock (it can take // a long time.) 
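
The should_compile lambda above collapses to a small decision procedure: strict mode (no threshold) always compiles on a cache miss, megamorphic clusters never compile again, the first execution compiles eagerly, and everything else waits until the request count for this signature reaches the threshold. A condensed standalone sketch, with std::optional standing in for absl::optional:

#include <cstdint>
#include <optional>

bool ShouldCompile(std::optional<int64_t> compile_threshold, bool is_megamorphic,
                   bool is_first_execution, int64_t request_count) {
  if (!compile_threshold.has_value()) return true;  // lazy compilation disabled
  if (is_megamorphic) return false;                 // never pays off; stay in TF
  if (is_first_execution) return true;              // assume static shapes at first
  return request_count >= *compile_threshold;       // otherwise wait for warmup
}
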
- std::vector args; - TF_RETURN_IF_ERROR( - BuildArguments(constant_args, variable_args, ctx, &args)); XlaCompiler compiler(options); entry->compiled = true; - if (compile_single_op) { - entry->compilation_status = - compiler.CompileSingleOp(compile_options, signature.name, ctx, args, - &entry->compilation_result); - } else { - entry->compilation_status = compiler.CompileFunction( - compile_options, function, args, &entry->compilation_result); - } + entry->compilation_status = + compile_fn(&compiler, &entry->compilation_result); TF_RETURN_IF_ERROR(entry->compilation_status); CHECK_EQ(entry->executable.get(), nullptr); entry->compilation_status = diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h index b43e5d40e64..846d0c963db 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.h +++ b/tensorflow/compiler/jit/xla_compilation_cache.h @@ -17,9 +17,12 @@ limitations under the License. #define TENSORFLOW_COMPILER_JIT_XLA_COMPILATION_CACHE_H_ #include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/graph.pb.h" @@ -30,13 +33,6 @@ limitations under the License. namespace tensorflow { -// Struct that represents a possibly-absent Tensor. -struct OptionalTensor { - string name; // A descriptive name - bool present = false; // Is the tensor present? - Tensor value; // If present, what is the Tensor's value? -}; - // The XlaCompilationCache class caches the results of the XlaCompiler class, // which converts a Tensorflow graph into a compiled XLA compilation. // @@ -58,11 +54,7 @@ class XlaCompilationCache : public ResourceBase { // Compiles a function into a XlaCompiler::CompilationResult that can be used // to execute an XLA Computation. Compilation results are cached. // `function` is the name of a Tensorflow function to compile. - // `constant_args` is a map of tensorflow argument number to its constant - // value. - // `variable_args` is a snapshot of the current values of the - // resource variable arguments to `function`; uninitialized variables are - // represented by an absent OptionalTensor. + // `args` is a description of the arguments to the computation. // // `compile_mode` controls the behavior of the compilation cache on a cache // miss. If `compile_mode` is `kLazy` then, based on some profitability @@ -78,9 +70,7 @@ class XlaCompilationCache : public ResourceBase { // outputs. Status Compile(const XlaCompiler::Options& options, const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, - OpKernelContext* ctx, + absl::Span args, const XlaCompiler::CompileOptions& compile_options, CompileMode compile_mode, const XlaCompiler::CompilationResult** out_compilation_result, @@ -90,8 +80,7 @@ class XlaCompilationCache : public ResourceBase { // XlaCompiler::CompileFunction. 
Status CompileSingleOp( const XlaCompiler::Options& options, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, + absl::Span args, OpKernelContext* ctx, const XlaCompiler::CompileOptions& compile_options, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable); @@ -101,26 +90,6 @@ class XlaCompilationCache : public ResourceBase { string DebugString() override; - private: - // Common implementation of Compile and CompileSingleOp. - Status CompileImpl( - const XlaCompiler::Options& options, const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, - const XlaCompiler::CompileOptions& compile_options, - bool compile_single_op, int64 compile_threshold, - const XlaCompiler::CompilationResult** out_compilation_result, - xla::LocalExecutable** out_executable); - - // Takes `result` which has been compiled from a Tensorflow subgraph to a - // XLA computation already, and generates an XLA LocalExecutable `executable`. - Status BuildExecutable(const XlaCompiler::Options& options, - const XlaCompiler::CompilationResult& result, - std::unique_ptr* executable); - - xla::LocalClient* const client_; - const DeviceType device_type_; - // Describes the types, shapes and any compile-time constant arguments // to a kernel. Key that uniquely identifies a compilation output. struct Signature { @@ -137,14 +106,35 @@ class XlaCompilationCache : public ResourceBase { struct Hash { uint64 operator()(const Signature& signature) const; }; + + // Returns a human-readable description of the signature. + string HumanString() const; }; - static string SignatureDebugString(const Signature& sig); // Builds the signature for a compilation. - Status BuildSignature(const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, - OpKernelContext* ctx, Signature* signature); + static xla::StatusOr BuildSignature( + const NameAttrList& function, + absl::Span args); + + private: + // Common implementation of Compile and CompileSingleOp. + Status CompileImpl( + const XlaCompiler::Options& options, const NameAttrList& function, + absl::Span args, + const std::function& compile_fn, + absl::optional compile_threshold, + const XlaCompiler::CompilationResult** out_compilation_result, + xla::LocalExecutable** out_executable); + + // Takes `result` which has been compiled from a Tensorflow subgraph to a + // XLA computation already, and generates an XLA LocalExecutable `executable`. + Status BuildExecutable(const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result, + std::unique_ptr* executable); + + xla::LocalClient* const client_; + const DeviceType device_type_; // The value associated with a cache entry. struct Entry { @@ -180,7 +170,13 @@ class XlaCompilationCache : public ResourceBase { // Cumulative time spent compiling the cluster. int64 cumulative_compile_time_us = 0; + + // True if we have decided that this cluster is too dynamic (i.e. its shapes + // change too frequently) to profitably JIT compile. Once a cluster is + // tagged megamorphic, it stays megamorphic forever. + bool is_megamorphic = false; }; + mutex cluster_compile_stats_mu_; // Maps cluster names to compilation statistics for said cluster. 
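
The Signature key declared above has to keep operator== and Hash consistent, and, per the operator== fix earlier in this file, equality must inspect the dtype and shape of constant arguments rather than only their raw bytes: an int32 constant of shape {4, 0} and a float constant of shape {0, 4} both have empty tensor_data yet must map to distinct cache entries, which is what the new SignatureEquality test below verifies. A toy stand-in showing the equality/hash pairing (not the real Signature class):

#include <cstddef>
#include <functional>
#include <string>
#include <vector>

struct ToySignature {
  std::string name;
  int dtype = 0;
  std::vector<int> shape;
  std::string data;  // raw constant bytes; may be empty for zero-element tensors

  bool operator==(const ToySignature& o) const {
    // Comparing data alone is not enough; dtype and shape disambiguate
    // zero-element constants that share the same (empty) byte string.
    return name == o.name && dtype == o.dtype && shape == o.shape && data == o.data;
  }
};

struct ToySignatureHash {
  std::size_t operator()(const ToySignature& s) const {
    std::size_t h = std::hash<std::string>()(s.name);
    h = h * 31 + std::hash<int>()(s.dtype);
    for (int d : s.shape) h = h * 31 + std::hash<int>()(d);
    return h * 31 + std::hash<std::string>()(s.data);
  }
};
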
diff --git a/tensorflow/compiler/jit/xla_compilation_cache_test.cc b/tensorflow/compiler/jit/xla_compilation_cache_test.cc new file mode 100644 index 00000000000..018c7c219f4 --- /dev/null +++ b/tensorflow/compiler/jit/xla_compilation_cache_test.cc @@ -0,0 +1,54 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_compilation_cache.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +TEST(XlaCompilationCacheTest, SignatureEquality) { + NameAttrList fn; + fn.set_name("afunction"); + std::vector args(1); + args[0].kind = XlaCompiler::Argument::kConstant; + args[0].type = DT_INT32; + args[0].shape = TensorShape({4, 0}); + args[0].constant_value = Tensor(DT_INT32, {4, 0}); + TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s1, + XlaCompilationCache::BuildSignature(fn, args)); + + args[0].type = DT_FLOAT; + args[0].constant_value = Tensor(DT_FLOAT, {4, 0}); + TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s2, + XlaCompilationCache::BuildSignature(fn, args)); + + args[0].shape = TensorShape({0, 4}); + args[0].constant_value = Tensor(DT_FLOAT, {0, 4}); + TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s3, + XlaCompilationCache::BuildSignature(fn, args)); + + std::vector signatures = {s1, s2, s3}; + for (int i = 0; i < signatures.size(); ++i) { + for (int j = 0; j < signatures.size(); ++j) { + EXPECT_EQ(i == j, signatures[i] == signatures[j]) + << signatures[i].HumanString() << " " << signatures[j].HumanString(); + } + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index 31cb32e3059..1fe612d43d1 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -187,8 +187,13 @@ Status XlaCompileOnDemandOp::Compile( compile_options.always_return_tuple = false; std::map variable_args = GetVariables(ctx); - return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx, - compile_options, result, executable); + + std::vector args; + TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_arguments, variable_args, ctx, &args)); + + return cache->CompileSingleOp(options, args, ctx, compile_options, result, + executable); } void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index cbfeb388050..116e0756036 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -42,8 +42,10 @@ Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& session_options, XlaOpRegistry::DeviceRegistration registration; registration.compilation_device_name = DEVICE_CPU_XLA_JIT; - 
registration.requires_compilation = !compile_on_demand; - registration.enable_jit_by_default = false; + registration.autoclustering_policy = + compile_on_demand + ? XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested + : XlaOpRegistry::AutoclusteringPolicy::kAlways; registration.compile_resource_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_CPU, registration); diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 2289abd2df3..5c1b55cb57f 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -446,7 +446,7 @@ XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, // Any op assigned to the device that isn't rewritten by the graph rewriter // gets executed by a n XlaCompileOnDemandOp, which compiles it and executes // it just-in-time. - kernel_factory::OpKernelRegistrar::Factory factory = + OpKernel* (*factory)(OpKernelConstruction*) = [](OpKernelConstruction* context) -> OpKernel* { return new XlaCompileOnDemandOp(context); }; diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 8881b697bc8..49f53b477ef 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -112,6 +112,12 @@ class XlaDevice : public LocalDevice { // compute, host-to-device, and device-to-host communication. bool use_multiple_streams = false; + // A function that describes how the on-host shapes of + // a) argument and return value, for entry computations + // b) variables, for all computations, + // should be represented in XLA. Parameters/return values will be shaped + // according to this function, and reshaped back to/from their declared + // shapes for computations. Must be non-null. XlaCompiler::ShapeRepresentationFn shape_representation_fn; // If padded_shape_fn is empty, a default implementation that returns diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index eb3cf27624b..6e6532731e6 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -70,9 +70,12 @@ XlaDeviceContext::XlaDeviceContext( CHECK(device_to_host_stream_ != nullptr); CHECK(stream_ != nullptr); if (!shape_representation_fn_) { - shape_representation_fn_ = - [](const TensorShape& shape, - DataType dtype) -> xla::StatusOr { return shape; }; + shape_representation_fn_ = [](const TensorShape& shape, + DataType dtype) -> xla::StatusOr { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape)); + return xla_shape; + }; } } @@ -99,7 +102,7 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, CHECK(xla_tensor); Status status = [&]() -> Status { - TF_ASSIGN_OR_RETURN(TensorShape shape, + TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn_(device_tensor->shape(), device_tensor->dtype())); @@ -111,9 +114,15 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_, stream_->parent()->device_ordinal())); + // The cpu_tensor and literal that we created here hold the data of host + // tensor in descending layout. The layout could be different from layout in + // device_tensor (but the logical shape has to be the same). The + // transfer_manager is responsible to do corresponding transposing when + // transferring the data to device. 
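
A shape_representation_fn, as documented in the xla_device.h comment above, maps a host-side TensorShape plus dtype to the xla::Shape used for parameters, return values and variables; the XlaDeviceContext default now builds a genuine xla::Shape via TensorShapeToXLAShape instead of echoing the TensorShape back. A very rough standalone sketch of the callback's shape, with toy types rather than the real TensorShape/xla::Shape:

#include <cstdint>
#include <functional>
#include <vector>

// Toy stand-ins: a host shape is a dim vector; a device-side shape also
// carries an element type tag.
struct ToyTensorShape { std::vector<int64_t> dims; };
struct ToyXlaShape { int element_type; std::vector<int64_t> dims; };

using ShapeRepresentationFn =
    std::function<ToyXlaShape(const ToyTensorShape&, int /*dtype*/)>;

// Default behaviour sketched here: keep the logical dimensions and record the
// element type; a device-specific implementation could pad or relayout them,
// and the runtime reshapes tensors back to their declared shapes at the edges.
ShapeRepresentationFn default_representation = [](const ToyTensorShape& shape,
                                                  int dtype) {
  return ToyXlaShape{dtype, shape.dims};
};
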
xla::BorrowingLiteral literal( static_cast(DMAHelper::base(cpu_tensor)), - xla_tensor->shaped_buffer().on_host_shape()); + xla::ShapeUtil::MakeShape(shape.element_type(), + xla::AsInt64Slice(shape.dimensions()))); VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " " << xla_tensor->shaped_buffer().ToString(); @@ -183,8 +192,15 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor); xla_tensor->WaitForDefinitionEventOnStream(device_to_host_stream_.get()); + // Transfer manager requires the shape of the shaped buffer to be the same as + // literal shape except for the layout. Set the literal to use xla_tensor's + // shape as it is derived from the cpu_tensor's shape using + // shape_representation_fn_. xla::MutableBorrowingLiteral literal; - TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(cpu_tensor, &literal)); + TF_CHECK_OK(HostTensorToMutableBorrowingLiteral( + xla::LayoutUtil::GetWithDefaultLayout( + xla_tensor->shaped_buffer().on_host_shape()), + cpu_tensor, &literal)); TensorReference ref(*device_tensor); transfer_manager_->TransferLiteralFromDevice( diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 241ea8f60df..adf0f994b84 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/core/kernels/resource_variable_ops.h" #include "tensorflow/core/kernels/sendrecv_ops.h" #include "tensorflow/core/kernels/shape_ops.h" +#include "tensorflow/core/kernels/stack.h" #include "tensorflow/core/kernels/variable_ops.h" namespace tensorflow { @@ -257,9 +258,27 @@ class XlaAssignVariableOp : public OpKernel { .Device(DEVICE) \ .TypeConstraint("T") \ .HostMemory("input"), \ - RetvalOp); + RetvalOp); \ + \ + REGISTER_KERNEL_BUILDER(Name("StackV2") \ + .Device(DEVICE) \ + .HostMemory("max_size") \ + .HostMemory("handle"), \ + StackOp); \ + REGISTER_KERNEL_BUILDER(Name("StackPushV2") \ + .Device(DEVICE) \ + .HostMemory("handle") \ + .TypeConstraint("T", TYPES), \ + TemplatedStackPushOp); \ + REGISTER_KERNEL_BUILDER(Name("StackPopV2") \ + .Device(DEVICE) \ + .HostMemory("handle") \ + .TypeConstraint("elem_type", TYPES), \ + StackPopOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("StackCloseV2").Device(DEVICE).HostMemory("handle"), StackCloseOp); -// TODO(phawkins): currently we do not register the QueueEnqueueMany, +// TODO(b/118881356): currently we do not register the QueueEnqueueMany, // QueueDequeueMany, or QueueDequeueUpTo kernels because they attempt to read // and write the tensors they access in order to concatenate them into a batch. 
// We would need either to call out to an XLA computation to perform the diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 8f28b38b5e1..44197016958 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -37,8 +37,8 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options, std::vector* devices) { XlaOpRegistry::DeviceRegistration registration; registration.compilation_device_name = DEVICE_GPU_XLA_JIT; - registration.requires_compilation = true; - registration.enable_jit_by_default = false; + registration.autoclustering_policy = + XlaOpRegistry::AutoclusteringPolicy::kAlways; registration.compile_resource_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_GPU, registration); @@ -53,24 +53,25 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options, return Status::OK(); } - XlaDevice::Options options; - options.platform = platform.ValueOrDie(); - options.device_name_prefix = name_prefix; - options.device_name = DEVICE_XLA_GPU; - options.device_ordinal = 0; - options.compilation_device_name = DEVICE_GPU_XLA_JIT; - options.use_multiple_streams = false; - auto device = absl::make_unique(session_options, options); + for (int i = 0; i < platform.ValueOrDie()->VisibleDeviceCount(); ++i) { + XlaDevice::Options options; + options.platform = platform.ValueOrDie(); + options.device_name_prefix = name_prefix; + options.device_name = DEVICE_XLA_GPU; + options.device_ordinal = i; + options.compilation_device_name = DEVICE_GPU_XLA_JIT; + options.use_multiple_streams = true; + auto device = absl::make_unique(session_options, options); - // TODO(b/78468222): Uncomment after fixing this bug - // status = device->UseGpuDeviceInfo(); - // if (!status.ok()) { - // errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT, - // " device"); - // return status; - // } + Status status = device->UseGpuDeviceInfo(); + if (!status.ok()) { + errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT, + " device number ", i); + return status; + } - devices->push_back(device.release()); + devices->push_back(device.release()); + } return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc index dc37362fd86..e828bae865d 100644 --- a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ b/tensorflow/compiler/jit/xla_interpreter_device.cc @@ -45,8 +45,8 @@ Status XlaInterpreterDeviceFactory::CreateDevices( XlaOpRegistry::DeviceRegistration registration; registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT; - registration.requires_compilation = true; - registration.enable_jit_by_default = false; + registration.autoclustering_policy = + XlaOpRegistry::AutoclusteringPolicy::kAlways; registration.compile_resource_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER, registration); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 6e51bfca4a1..3b0bda4caa1 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -191,40 +191,6 @@ Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { return Status::OK(); } -namespace internal { -// Return the 'index''th subtree of the given ShapedBuffer as a -// ScopedShapedBuffer. 
The returned ScopedShapedBuffer takes ownership of the -// subtree, and sets the input's buffer pointers to nullptr for the subtree. -ScopedShapedBuffer ExtractSubShapedBuffer( - ShapedBuffer* shaped_buffer, int index, - xla::DeviceMemoryAllocator* allocator) { - const xla::Shape& on_host_shape = xla::ShapeUtil::GetTupleElementShape( - shaped_buffer->on_host_shape(), index); - const xla::Shape& on_device_shape = xla::ShapeUtil::GetTupleElementShape( - shaped_buffer->on_device_shape(), index); - - ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape, - shaped_buffer->platform(), - shaped_buffer->device_ordinal()); - - auto& shape_tree = shaped_buffer->buffers(); - auto& sub_shape_tree = sub_shaped_buffer.buffers(); - sub_shape_tree.CopySubtreeFrom(shape_tree, - /*source_base_index=*/{index}, - /*target_base_index=*/{}); - shape_tree.ForEachMutableElement( - [index](const xla::ShapeIndex& shape_index, - tensorflow::se::DeviceMemoryBase* data) { - // shape_index is empty for the root node. Ignore that. - if (!shape_index.empty() && shape_index[0] == index) { - *data = tensorflow::se::DeviceMemoryBase(nullptr, 0); - } - }); - return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator); -} -} // namespace internal -using internal::ExtractSubShapedBuffer; - XlaComputationLaunchContext::XlaComputationLaunchContext( xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors, bool use_multiple_streams) @@ -391,8 +357,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); if (xla_tensor) { - xla_tensor->set_shaped_buffer(ScopedShapedBuffer( - ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); + xla_tensor->set_shaped_buffer(output.TakeSubTree({output_num})); if (use_multiple_streams_) { xla_tensor->ResetDefinitionEvent(definition_event, stream); } @@ -445,7 +410,6 @@ Status XlaComputationLaunchContext::PopulateOutputs( for (int i = 0; i < kernel->resource_updates.size(); ++i) { Allocator* allocator = ctx->device()->GetAllocator({}); const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i]; - se::DeviceMemoryBase buffer = output.buffer({output_num}); if (variable_infos[i].var()->tensor()->dtype() != write.type) { return errors::Internal("Mismatched type in variable write"); @@ -455,18 +419,20 @@ Status XlaComputationLaunchContext::PopulateOutputs( Tensor output_tensor; TF_RETURN_IF_ERROR( ctx->allocate_temp(write.type, write.shape, &output_tensor)); - XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor); - CHECK(xla_tensor); - xla_tensor->set_shaped_buffer( - ExtractSubShapedBuffer(&output, output_num, xla_allocator_)); - if (use_multiple_streams_) { - xla_tensor->ResetDefinitionEvent(definition_event, stream); + if (write.shape.num_elements() > 0) { + XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor); + CHECK(xla_tensor); + xla_tensor->set_shaped_buffer(output.TakeSubTree({output_num})); + if (use_multiple_streams_) { + xla_tensor->ResetDefinitionEvent(definition_event, stream); + } } *variable_infos[i].var()->tensor() = output_tensor; } else { + se::DeviceMemoryBase buffer = output.buffer({output_num}); + output.set_buffer(xla::OwningDeviceMemory(), {output_num}); Tensor output_tensor = XlaTensorBuffer::MakeTensor( write.type, write.shape, buffer, allocator); - output.set_buffer(xla::OwningDeviceMemory(), {output_num}); *variable_infos[i].var()->tensor() = output_tensor; 
} ++output_num; @@ -474,4 +440,60 @@ Status XlaComputationLaunchContext::PopulateOutputs( return Status::OK(); } +Status XlaComputationLaunchContext::BuildXlaCompilerArguments( + const std::map& constant_args, + const std::map& variable_args, OpKernelContext* ctx, + std::vector* args) { + args->resize(ctx->num_inputs()); + + for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) { + XlaCompiler::Argument& arg = (*args)[input_num]; + if (constant_args.count(input_num) > 0) { + // Handles compile-time constants. + const Tensor& input = constant_args.at(input_num); + TF_RET_CHECK(input.dtype() != DT_RESOURCE); + arg.kind = XlaCompiler::Argument::kConstant; + arg.type = input.dtype(); + arg.shape = input.shape(); + arg.constant_value = input; + } else if (variable_args.count(input_num) == 0) { + // Handles the non-constant arguments. + const Tensor& input = ctx->input(input_num); + TF_RET_CHECK(input.dtype() != DT_RESOURCE); + if (input.NumElements() > 0) { + arg.kind = XlaCompiler::Argument::kParameter; + } else { + arg.kind = XlaCompiler::Argument::kConstant; + arg.constant_value = input; + } + arg.type = input.dtype(); + arg.shape = input.shape(); + } else { + // Handles resource variables. + const Tensor& input = ctx->input(input_num); + TF_RET_CHECK(input.dtype() == DT_RESOURCE); + const OptionalTensor& variable = variable_args.at(input_num); + arg.name = variable.name; + arg.kind = XlaCompiler::Argument::kResource; + arg.resource_kind = XlaResource::kVariable; + if (variable.present) { + const Tensor& value = variable.value; + arg.type = value.dtype(); + arg.shape = value.shape(); + arg.initialized = true; + } else { + // The values of uninitialized variables are not passed as inputs, since + // they are meaningless. However, it is legal to assign to a resource + // variable for the first time inside the XLA computation, so we do + // permit uninitialized variables. + arg.initialized = false; + arg.type = DT_INVALID; + arg.shape = TensorShape(); + } + } + } + + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 81e205d13f7..437db019a0e 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -35,6 +35,13 @@ limitations under the License. namespace tensorflow { class XlaAllocator; +// Struct that represents a possibly-absent Tensor. +struct OptionalTensor { + string name; // A descriptive name + bool present = false; // Is the tensor present? + Tensor value; // If present, what is the Tensor's value? +}; + // Takes a snapshot of the values of resource variable arguments, whose indices // are specified in `variable_indices` argument. We snapshot tensors that back // resource variables since concurrent updates may modify the shape, and it is @@ -139,6 +146,13 @@ class XlaComputationLaunchContext { bool allocate_xla_tensors, bool use_multiple_streams); + // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch + // op. + static Status BuildXlaCompilerArguments( + const std::map& constant_args, + const std::map& variable_args, OpKernelContext* ctx, + std::vector* args); + // Add all inputs within `ctx` as XLA arguments (returned by arguments()). // `variables` is a map from TensorFlow argument number to resource variable. 
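
BuildXlaCompilerArguments above classifies every kernel input into one of three kinds: entries in constant_args become compile-time constants, entries in variable_args become resource arguments (marked uninitialized when the OptionalTensor snapshot is absent), and the remaining inputs become runtime parameters. A compact standalone restatement of just that classification rule, with toy stand-ins for the TF types:

#include <map>
#include <string>

enum class ArgKind { kConstant, kParameter, kResource };

// Mirrors OptionalTensor: a possibly-absent snapshot of a resource variable.
struct ToyOptionalTensor {
  std::string name;
  bool present = false;
};

ArgKind ClassifyInput(int input_num,
                      const std::map<int, std::string>& constant_args,
                      const std::map<int, ToyOptionalTensor>& variable_args) {
  if (constant_args.count(input_num) > 0) return ArgKind::kConstant;
  if (variable_args.count(input_num) > 0) return ArgKind::kResource;
  // The real code additionally demotes empty non-constant inputs to constants.
  return ArgKind::kParameter;
}
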
// @@ -223,17 +237,6 @@ class XlaTensorBuffer : public TensorBuffer { Allocator* allocator_; }; -// Exposed in this header file for microbenchmarking purposes, but this is an -// internal implementation detail. -namespace internal { -// Return the 'index''th subtree of the given ShapedBuffer as a -// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the -// subtree, and sets the input's buffer pointers to nullptr for the subtree. -xla::ScopedShapedBuffer ExtractSubShapedBuffer( - xla::ShapedBuffer* shaped_buffer, int index, - xla::DeviceMemoryAllocator* allocator); -} // namespace internal - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc deleted file mode 100644 index a45932403ec..00000000000 --- a/tensorflow/compiler/jit/xla_launch_util_test.cc +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Contains microbenchmarks for performance critical functions in -// xla_launch_util.cc. - -#include "tensorflow/compiler/jit/xla_launch_util.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/test_benchmark.h" - -// Test ExtractSubBuffer with different depths (depth of ShapeTree) and fan-outs -// (cardinality of each non-leaf node's children). -void BM_ExtractSubBuffer(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); - xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128}); - for (int i = 0; i < depth; ++i) { - std::vector shapes(fan_out, shape); - shape = xla::ShapeUtil::MakeTupleShape(shapes); - } - xla::ShapedBuffer shaped_buffer(shape, shape, /*platform=*/nullptr, - /*device_ordinal=*/0); - tensorflow::testing::StartTiming(); - for (int i = 0; i < iters; ++i) { - // Extract a buffer from approximately the middle of the first level of the - // tree. 
- (void)tensorflow::internal::ExtractSubShapedBuffer(&shaped_buffer, - /*index=*/fan_out / 2, - /*allocator=*/nullptr) - .release(); - } -} - -BENCHMARK(BM_ExtractSubBuffer) - ->ArgPair(1, 4) - ->ArgPair(1, 8) - ->ArgPair(1, 32) - ->ArgPair(1, 64) - ->ArgPair(1, 128) - ->ArgPair(1, 256) - ->ArgPair(1, 512) - ->ArgPair(2, 4) - ->ArgPair(2, 8) - ->ArgPair(2, 32) - ->ArgPair(2, 64) - ->ArgPair(2, 128); - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - tensorflow::testing::RunBenchmarks(); - return RUN_ALL_TESTS(); -} diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index 6f8b198262d..d1f7f754c83 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -43,11 +43,10 @@ namespace tensorflow { } } -Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape, +Status XlaTensor::AllocateShapedBuffer(DataType dtype, + const xla::Shape& on_host_shape, xla::LocalClient* client, int device_ordinal) { - xla::Shape on_host_shape; - TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &on_host_shape)); xla::Shape on_device_shape = client->backend().transfer_manager()->HostShapeToDeviceShape( on_host_shape); diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index 6d7a6fd66c8..77e80aa2527 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -50,7 +50,7 @@ class XlaTensor { // Assign the internal ShapedBuffer to new memory for the given dtype and // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it // is replaced and the managed memory deallocated. - Status AllocateShapedBuffer(DataType dtype, const TensorShape& shape, + Status AllocateShapedBuffer(DataType dtype, const xla::Shape& on_host_shape, xla::LocalClient* client, int device_ordinal); // Some Tensors can have complex on-device shapes, including tuple shapes. To diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 6945de1eda1..6b8e6bba1e1 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -470,12 +470,12 @@ tf_xla_py_test( tags = ["optonly"], deps = [ ":xla_test", - "//tensorflow/contrib/signal:signal_py", "//tensorflow/python:array_ops", "//tensorflow/python:extra_py_tests_deps", "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:spectral_ops", + "//tensorflow/python/ops/signal", ], ) @@ -837,8 +837,6 @@ tf_xla_py_test( name = "stack_ops_test", size = "small", srcs = ["stack_ops_test.py"], - # Stack ops are not implemented in the on-demand compilation model yet. 
- disabled_backends = ["cpu_ondemand"], deps = [ ":xla_test", "//tensorflow/python:array_ops", diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 4e6dd6abfc9..332381c59ee 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools + import numpy as np from tensorflow.compiler.tests import xla_test @@ -967,7 +969,7 @@ class BinaryOpsTest(xla_test.XLATestCase): self._testBinary( array_ops.expand_dims, np.array([42], dtype=dtype), - np.int32(0), + np.array([0], dtype=np.int64), expected=np.array([[42]], dtype=dtype)) self._testBinary( array_ops.expand_dims, @@ -994,15 +996,21 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([[[1, 2], [3, 4]]], dtype=dtype), np.int32(3), expected=np.array([[[[1], [2]], [[3], [4]]]], dtype=dtype)) + self._testBinary( + array_ops.expand_dims, + np.array([[[1, 2], [3, 4]]], dtype=dtype), + np.array([2], dtype=np.int64), + expected=np.array([[[[1, 2]], [[3, 4]]]], dtype=dtype)) def testPad(self): - for dtype in self.numeric_types: + for dtype, pad_type in itertools.product( + self.numeric_types, [np.int32, np.int64]): self._testBinary( array_ops.pad, np.array( [[1, 2, 3], [4, 5, 6]], dtype=dtype), np.array( - [[1, 2], [2, 1]], dtype=np.int32), + [[1, 2], [2, 1]], dtype=pad_type), expected=np.array( [[0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 0], @@ -1016,7 +1024,7 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array( [[1, 2, 3], [4, 5, 6]], dtype=dtype), np.array( - [[0, 3], [2, 1]], dtype=np.int32), + [[0, 3], [2, 1]], dtype=pad_type), expected=np.array( [[7, 7, 1, 2, 3, 7], [7, 7, 4, 5, 6, 7], diff --git a/tensorflow/compiler/tests/fft_test.py b/tensorflow/compiler/tests/fft_test.py index b3e13fbaa6b..e92afd5d6fe 100644 --- a/tensorflow/compiler/tests/fft_test.py +++ b/tensorflow/compiler/tests/fft_test.py @@ -24,10 +24,10 @@ import numpy as np import scipy.signal as sps from tensorflow.compiler.tests import xla_test -from tensorflow.contrib.signal.python.ops import spectral_ops as signal from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl +from tensorflow.python.ops import signal from tensorflow.python.ops import spectral_ops from tensorflow.python.platform import googletest diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py index 561715ee1c3..6f51ae33a1b 100644 --- a/tensorflow/compiler/tests/jit_test.py +++ b/tensorflow/compiler/tests/jit_test.py @@ -593,6 +593,67 @@ class LazyCompilationTest(test.TestCase): self.assertFalse( InLabels(RunMetadataLabels(run_metadata_for_new_shape), "_XlaRun")) + def testIsMegamorphic(self): + + @function.Defun(compiled=True) + def CompiledFunction(x): + return math_ops.log(x) + + with session_lib.Session(config=NoRewriteSessionConfig()) as sess: + x = array_ops.placeholder(dtypes.float32) + y = CompiledFunction(x) + + # Make the cluster go megamorphic by running it with lots of shape + # signatures where the cluster is executed with each signature only a few + # times. Then check that we don't compile the cluster ever again. + + for shape in range(10, 50): + for _ in range(0, 49): + sess.run(y, feed_dict={x: [0.] * shape}) + + for _ in range(0, 50): + run_metadata = config_pb2.RunMetadata() + sess.run( + y, + feed_dict={x: [0.] 
* 60}, + run_metadata=run_metadata, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE)) + self.assertTrue( + InLabels(RunMetadataLabels(run_metadata), "_XlaCompile")) + self.assertFalse(InLabels(RunMetadataLabels(run_metadata), "_XlaRun")) + + def testIsNotMegamorphic(self): + + @function.Defun(compiled=True) + def CompiledFunction(x): + return math_ops.log(x) + + with session_lib.Session(config=NoRewriteSessionConfig()) as sess: + x = array_ops.placeholder(dtypes.float32) + y = CompiledFunction(x) + + # Run the cluster with lots of shape signatures, but in a way that it + # isn't megamorphic (i.e. each shape signature sees a lot of executions). + # Then check that the cluster has not been marked as megamorphic. + + for shape in range(10, 50): + for _ in range(0, 1000): + sess.run(y, feed_dict={x: [0.] * shape}) + + for _ in range(0, 10): + sess.run(y, feed_dict={x: [0.] * 60}) + + run_metadata = config_pb2.RunMetadata() + sess.run( + y, + feed_dict={x: [0.] * 60}, + run_metadata=run_metadata, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE)) + self.assertTrue(InLabels(RunMetadataLabels(run_metadata), "_XlaCompile")) + self.assertTrue(InLabels(RunMetadataLabels(run_metadata), "_XlaRun")) + if __name__ == "__main__": os.environ["TF_XLA_FLAGS"] = ("--tf_xla_enable_lazy_compilation=true " + diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index cfccf5f3d2a..a6b58020126 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -2466,20 +2466,21 @@ TEST_F(OpTest, Pack) { }); } -// TODO(b/31741898): crashes on GPU. TEST_F(OpTest, Pad) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector t_dims = RandomDims(); - // TODO(b/31741996): re-enable DT_INT64 when bug is fixed. 
- // DataType tpaddings = Choose({DT_INT32, DT_INT64}); - DataType tpaddings = DT_INT32; + DataType tpaddings = Choose({DT_INT32, DT_INT64}); std::vector paddings_vec; - std::uniform_int_distribution distribution(0, 7); for (int i = 0; i < t_dims.size(); ++i) { - paddings_vec.push_back(distribution(generator())); - paddings_vec.push_back(distribution(generator())); + std::uniform_int_distribution pad_distribution(0, t_dims[i]); + int pad_size = pad_distribution(generator()); + std::uniform_int_distribution lower_distribution(0, pad_size); + int low_pad_size = lower_distribution(generator()); + paddings_vec.push_back(low_pad_size); + paddings_vec.push_back(pad_size - low_pad_size); + t_dims[i] -= pad_size; } Tensor paddings; CHECK( diff --git a/tensorflow/compiler/tests/resampler_ops_test.py b/tensorflow/compiler/tests/resampler_ops_test.py index d05554fdb68..f87ac3360c9 100644 --- a/tensorflow/compiler/tests/resampler_ops_test.py +++ b/tensorflow/compiler/tests/resampler_ops_test.py @@ -37,7 +37,7 @@ class ResamplerOpsTest(xla_test.XLATestCase): out = sess.run(resampled, {input_image: image_np, warp: warp_np}) self.assertAllCloseAccordingToType( - expected, out, half_rtol=1e-2, bfloat16_rtol=3e-2) + expected, out, rtol=5e-3, half_rtol=1e-2, bfloat16_rtol=3e-2) def _assertBackwardOpMatchesExpected(self, input_np, warp_np, grad_output_np, expected_grad_data, expected_grad_warp): diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py index dd2c252d383..77cdeac8168 100644 --- a/tensorflow/compiler/tests/variable_ops_test.py +++ b/tensorflow/compiler/tests/variable_ops_test.py @@ -40,6 +40,19 @@ from tensorflow.python.training.gradient_descent import GradientDescentOptimizer class VariableOpsTest(xla_test.XLATestCase): """Test cases for resource variable operators.""" + def testWriteEmptyShape(self): + # Verifies that we can pass an uninitialized variable with an empty shape, + # assign it a value, and successfully return it. + for dtype in self.numeric_types: + with self.test_session() as sess, self.test_scope(): + zeros = np.zeros([3, 0], dtype=dtype) + v = resource_variable_ops.ResourceVariable(zeros) + p = array_ops.placeholder(dtype) + x = v.assign(p) + with ops.control_dependencies([x]): + y = v.read_value() + self.assertAllClose(zeros, sess.run(y, {p: zeros})) + def testOneWriteOneOutput(self): # Regression test for a bug where computations with one non-constant # output and one variable update were mishandled. 
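// [Illustrative sketch, not part of this change] The padding scheme used by
// the updated randomized Pad test above: for each dimension, draw a total pad
// no larger than the dimension, split it into a before/after pair, and shrink
// the dimension by the same amount so the padded output keeps the originally
// drawn extent. Function and variable names here are illustrative only.
#include <cstdint>
#include <random>
#include <utility>
#include <vector>

std::vector<std::pair<int64_t, int64_t>> RandomPaddings(
    std::vector<int64_t>& dims, std::mt19937& rng) {
  std::vector<std::pair<int64_t, int64_t>> paddings;
  for (int64_t& dim : dims) {
    std::uniform_int_distribution<int64_t> total_dist(0, dim);
    const int64_t total = total_dist(rng);
    std::uniform_int_distribution<int64_t> before_dist(0, total);
    const int64_t before = before_dist(rng);
    paddings.emplace_back(before, total - before);  // pad before / pad after
    dim -= total;  // input shrinks so input + padding == original extent
  }
  return paddings;
}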
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 5fc9a352ff9..e0171415492 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -166,6 +166,7 @@ cc_library( "xla_compilation_device.cc", "xla_compiler.cc", "xla_context.cc", + "xla_expression.cc", "xla_helpers.cc", "xla_op_kernel.cc", "xla_op_registry.cc", @@ -180,6 +181,7 @@ cc_library( "xla_compilation_device.h", "xla_compiler.h", "xla_context.h", + "xla_expression.h", "xla_helpers.h", "xla_op_kernel.h", "xla_op_registry.h", @@ -194,6 +196,7 @@ cc_library( ":side_effect_util", ":tf2xla_util", "//tensorflow/compiler/jit:xla_cluster_util", + "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags", "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -217,6 +220,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", ], alwayslink = 1, @@ -362,8 +366,12 @@ tf_cc_test( tf_cc_test( name = "xla_compiler_test", - srcs = ["xla_compiler_test.cc"], + srcs = [ + "xla_compiler_test.cc", + "xla_expression_test.cc", + ], deps = [ + ":common", ":side_effect_util", ":xla_compiler", "//tensorflow/cc:cc_ops", @@ -386,6 +394,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", ], ) @@ -435,7 +444,7 @@ cc_library( "dump_graph.h", ], deps = [ - "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env", + "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.cc b/tensorflow/compiler/tf2xla/dump_graph_flags.cc index a6c908ba011..2eb1f8cd849 100644 --- a/tensorflow/compiler/tf2xla/dump_graph_flags.cc +++ b/tensorflow/compiler/tf2xla/dump_graph_flags.cc @@ -19,7 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/tf2xla/dump_graph_flags.h" -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" +#include "tensorflow/compiler/xla/parse_flags_from_env.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/command_line_flags.h" @@ -41,7 +41,7 @@ static void AllocateFlags() { "Path prefix to which graphs dumped during debugging should be " "written."), }); - xla::legacy_flags::ParseFlagsFromEnv(*flag_list); + xla::ParseFlagsFromEnv(*flag_list); } // Append to *append_to flag definitions associated with the XLA bridge's diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index f818d80022d..9ef9f49f422 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -242,23 +242,20 @@ Status FunctionalizeControlFlowPass::Run( continue; } const string func_attr = it->second; - if (kNodeTypeToFunctionAttrMapping->find(n->type_string()) != - kNodeTypeToFunctionAttrMapping->end()) { - NameAttrList func; - TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), func_attr, &func)); - VLOG(2) << "Graph has node " << n->type_string() - << ". 
Corresponding function: " << func.name(); - string new_func_name = options.flib_def->UniqueFunctionName( - absl::StrCat(func.name(), "_f15n_")); - bool modified; - TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( - func.name(), new_func_name, func.attr(), options.flib_def, flr, - &canonicalized_name_to_new_name, &modified)); - if (modified) { - n->ClearAttr(func_attr); - func.set_name(new_func_name); - n->AddAttr(func_attr, func); - } + NameAttrList func; + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), func_attr, &func)); + VLOG(2) << "Graph has node " << n->type_string() + << ". Corresponding function: " << func.name(); + string new_func_name = options.flib_def->UniqueFunctionName( + absl::StrCat(func.name(), "_f15n_")); + bool modified; + TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( + func.name(), new_func_name, func.attr(), options.flib_def, flr, + &canonicalized_name_to_new_name, &modified)); + if (modified) { + n->ClearAttr(func_attr); + func.set_name(new_func_name); + n->AddAttr(func_attr, func); } } diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index 706ed4f5bbf..efb75749722 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -23,9 +23,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -40,6 +40,7 @@ limitations under the License. 
#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/graph/validate.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" @@ -51,12 +52,11 @@ namespace { Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, const std::vector& expressions, std::vector* args) { - auto builder = ctx->builder(); auto client = ctx->compiler()->client(); - std::vector compile_time_constant_flags(expressions.size()); + std::vector arg_must_be_compile_time_constant(expressions.size()); TF_RETURN_IF_ERROR( - BackwardsConstAnalysis(*graph, &compile_time_constant_flags, + BackwardsConstAnalysis(*graph, &arg_must_be_compile_time_constant, /*compile_time_const_nodes=*/nullptr)); args->resize(expressions.size()); @@ -65,24 +65,31 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, arg.type = ctx->input_type(i); arg.shape = ctx->InputShape(i); - if (arg.type == DT_RESOURCE) { - return errors::InvalidArgument( - "Resource as function argument is not yet implemented."); - } else if (expressions[i]->has_constant_value()) { - arg.kind = XlaCompiler::Argument::kConstant; - arg.constant_value = expressions[i]->constant_value(); - } else if (compile_time_constant_flags[i]) { - arg.kind = XlaCompiler::Argument::kConstant; - TF_RET_CHECK(expressions[i]->resource() == nullptr) - << "Input with resource is not yet implemented."; - TF_ASSIGN_OR_RETURN(auto constant_graph, builder->BuildConstantSubGraph( - expressions[i]->handle())); - TF_ASSIGN_OR_RETURN(auto literal, - client->ComputeConstant(constant_graph)); - TF_RETURN_IF_ERROR( - LiteralToHostTensor(literal, arg.type, &arg.constant_value)); - } else { - arg.kind = XlaCompiler::Argument::kParameter; + switch (expressions[i]->kind()) { + case XlaExpression::Kind::kConstant: + arg.kind = XlaCompiler::Argument::kConstant; + arg.constant_value = expressions[i]->constant_value(); + break; + case XlaExpression::Kind::kXlaOp: + if (arg_must_be_compile_time_constant[i]) { + TF_ASSIGN_OR_RETURN(absl::optional value, + expressions[i]->ResolveConstant(client)); + if (!value.has_value()) { + return errors::InvalidArgument( + "Argument to function must be a compile-time constant, but " + "unable to resolve argument value to a constant."); + } + arg.kind = XlaCompiler::Argument::kConstant; + arg.constant_value = *value; + } else { + arg.kind = XlaCompiler::Argument::kParameter; + } + break; + case XlaExpression::Kind::kResource: + return errors::Unimplemented( + "Resource as function argument is not yet implemented."); + case XlaExpression::Kind::kInvalid: + return errors::InvalidArgument("Invalid function argument"); } } return Status::OK(); diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc index 276d744c096..2db2514397d 100644 --- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc @@ -14,11 +14,13 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/lib/core/errors.h" namespace tensorflow { @@ -49,13 +51,9 @@ class XlaArgOp : public XlaOpKernel { } const XlaExpression& arg = XlaContext::Get(ctx).args()[index_]; - if (arg.resource() != nullptr) { - ctx->SetResourceOutput(0, arg.resource()); - } else if (arg.has_constant_value()) { - ctx->SetConstantOutput(0, arg.constant_value()); - } else { - ctx->SetOutput(0, arg.handle()); - } + OP_REQUIRES(ctx, arg.kind() != XlaExpression::Kind::kInvalid, + errors::InvalidArgument("Invalid/missing argument expression")); + ctx->SetOutputExpression(0, arg); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc index 9fa57b76f8e..c022284fec6 100644 --- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc @@ -94,14 +94,10 @@ class BCastGradArgsOp : public XlaOpKernel { OP_REQUIRES(ctx, TensorShapeUtils::IsVector(in_shape), errors::InvalidArgument("In[", i, "] must be a vector.", in_shape.DebugString())); - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(i, &literal)); + std::vector vec; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(i, &vec)); - BCast::Vec vec; - for (int64 i = 0; i < in_shape.num_elements(); ++i) { - vec.push_back(literal.Get({i})); - } - shapes.push_back(vec); + shapes.push_back(BCast::Vec(vec.begin(), vec.end())); } BCast bcast(shapes[0], shapes[1]); OP_REQUIRES(ctx, bcast.IsValid(), diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc index e28755dd73b..cd7c7f4a82d 100644 --- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -45,15 +46,13 @@ class ConcatBaseOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { const TensorShape concat_dim_tensor_shape = ctx->InputShape(axis_index_); - OP_REQUIRES( - ctx, IsLegacyScalar(concat_dim_tensor_shape), - errors::InvalidArgument( - "Concat dim tensor should be a scalar integer, but got shape ", - concat_dim_tensor_shape.DebugString())); - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(axis_index_, &literal)); - // TODO(annarev): add a helper to support int64 input. 
- const int32 concat_dim = literal.Get({}); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(concat_dim_tensor_shape), + errors::InvalidArgument( + "Concat dim tensor should be a scalar, but got shape ", + concat_dim_tensor_shape.DebugString())); + int64 concat_dim; + OP_REQUIRES_OK(ctx, + ctx->ConstantInputAsIntScalar(axis_index_, &concat_dim)); std::vector values; std::vector shapes; @@ -63,9 +62,7 @@ class ConcatBaseOp : public XlaOpKernel { const TensorShape& input_shape = shapes[0]; int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim; - OP_REQUIRES(ctx, - (0 <= axis && axis < input_dims) || - (allow_legacy_scalars() && concat_dim == 0), + OP_REQUIRES(ctx, 0 <= axis && axis < input_dims, errors::InvalidArgument( "ConcatOp : Expected concatenating dimensions in the range " "[", @@ -75,14 +72,11 @@ class ConcatBaseOp : public XlaOpKernel { // elements. std::vector input_data; int output_concat_dim = 0; - const bool input_is_scalar = IsLegacyScalar(input_shape); for (int i = 0; i < N; ++i) { xla::XlaOp handle = values[i]; const TensorShape& in_shape = shapes[i]; - const bool in_is_scalar = IsLegacyScalar(in_shape); OP_REQUIRES( - ctx, - in_shape.dims() == input_dims || (input_is_scalar && in_is_scalar), + ctx, in_shape.dims() == input_dims, errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", input_shape.DebugString(), " vs. shape[", i, @@ -131,11 +125,10 @@ class ConcatOffsetOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { const TensorShape concat_dim_shape = ctx->InputShape(0); - OP_REQUIRES( - ctx, IsLegacyScalar(concat_dim_shape), - errors::InvalidArgument( - "Concat dim tensor should be a scalar integer, but got shape ", - concat_dim_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(concat_dim_shape), + errors::InvalidArgument( + "Concat dim tensor should be a scalar, but got shape ", + concat_dim_shape.DebugString())); for (int i = 1; i < ctx->num_inputs(); ++i) { OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ctx->InputShape(i)), errors::InvalidArgument("input ", i, @@ -162,39 +155,38 @@ class ConcatOffsetOp : public XlaOpKernel { // [0, 5, 0, 0] const int32 N = ctx->num_inputs() - 1; const TensorShape inp0_shape = ctx->InputShape(1); - xla::Literal inp0_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &inp0_literal)); - const int64 dims = inp0_shape.num_elements(); + std::vector inp0_dims; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &inp0_dims)); + const int64 inp0_rank = inp0_shape.num_elements(); - xla::Literal concat_dim_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &concat_dim_literal)); - const int64 cdim = concat_dim_literal.Get({}); + int64 cdim; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &cdim)); - VLOG(1) << "ConcatOffset " << cdim << "," << dims; - int32 axis = cdim < 0 ? cdim + dims : cdim; - OP_REQUIRES(ctx, FastBoundsCheck(axis, dims), + VLOG(1) << "ConcatOffset " << cdim << "," << inp0_rank; + int32 axis = cdim < 0 ? cdim + inp0_rank : cdim; + OP_REQUIRES(ctx, FastBoundsCheck(axis, inp0_rank), errors::InvalidArgument("Concat dim is out of range: ", axis, - " vs. ", dims)); + " vs. 
", inp0_rank)); int32 offset = 0; for (int i = 0; i < N; ++i) { const TensorShape inp_shape = ctx->InputShape(1 + i); - OP_REQUIRES(ctx, dims == inp_shape.num_elements(), - errors::InvalidArgument("input ", i, " should contain ", dims, - " elements, but got ", + OP_REQUIRES(ctx, inp0_rank == inp_shape.num_elements(), + errors::InvalidArgument("input ", i, " should contain ", + inp0_rank, " elements, but got ", inp_shape.num_elements())); - xla::Literal inp_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(1 + i, &inp_literal)); + std::vector inp_dims; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1 + i, &inp_dims)); - Tensor out_constant(DT_INT32, TensorShape({dims})); + Tensor out_constant(DT_INT32, TensorShape({inp0_rank})); auto out_vec = out_constant.vec(); - for (int64 j = 0; j < dims; ++j) { + for (int64 j = 0; j < inp0_rank; ++j) { if (j == axis) { out_vec(j) = offset; - offset += inp_literal.Get({j}); + offset += inp_dims[j]; } else { - const int32 inp0_element = inp0_literal.Get({j}); - const int32 inp_element = inp_literal.Get({j}); - OP_REQUIRES(ctx, (inp0_element == inp_element), + const int32 inp0_element = inp0_dims[j]; + const int32 inp_element = inp_dims[j]; + OP_REQUIRES(ctx, inp0_element == inp_element, errors::InvalidArgument("input[", i, ",", j, "] mismatch: ", inp0_element, " vs. ", inp_element)); diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc index 2628ef8e245..dff8af80022 100644 --- a/tensorflow/compiler/tf2xla/kernels/const_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc @@ -42,11 +42,6 @@ class ConstOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { TensorShape shape(proto_.tensor_shape()); - if (proto_.dtype() == DT_STRING) { - LOG(WARNING) << "Not computing Const of type DT_STRING"; - ctx->SetInvalidOutput(0); - return; - } xla::XlaBuilder* b = ctx->builder(); // To avoid blowups for large constants filled with the same value, diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc index e9bdb15aa0c..35e0625dbb0 100644 --- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { namespace { @@ -33,39 +34,20 @@ class FillOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { // The output of this Op is a tensor of shape 'dims_shape' with each // element set to the scalar 'dims_literal'. - const TensorShape dims_shape = ctx->InputShape(0); - const TensorShape value_shape = ctx->InputShape(1); + const TensorShape dims_shape = ctx->InputShape("dims"); + const TensorShape value_shape = ctx->InputShape("value"); OP_REQUIRES( - ctx, IsLegacyVector(dims_shape), + ctx, TensorShapeUtils::IsVector(dims_shape), errors::InvalidArgument("dims must be a vector of int32, got shape ", dims_shape.DebugString())); - OP_REQUIRES(ctx, IsLegacyScalar(value_shape), + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(value_shape), errors::InvalidArgument("value must be a scalar, got shape ", value_shape.DebugString())); - // Evaluate the 'dims' constant input, reshaping to a vector if it - // was a 'legacy' vector (secretly a scalar). 
- xla::Literal dims_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped( - 0, {dims_shape.num_elements()}, &dims_literal)); - // Convert the dims literal into a vector that we can pass to - // XlaBuilder. - std::vector broadcast; - broadcast.reserve(dims_literal.shape().dimensions(0)); - for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) { - broadcast.push_back(dims_literal.Get({i})); - } - // Look up the value input, reshaping to a scalar if it was a - // 'legacy' scalar (secretly a vector). - xla::XlaOp data = ctx->Input(1); - if (value_shape.dims() > 0) { - CHECK_EQ(value_shape.dims(), 1); - data = xla::Reshape(data, {}); - } - // Emit the actual computation, which broadcasts the scalar to the - // desired shape. - auto result = xla::Broadcast(data, broadcast); + std::vector dims; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("dims", &dims)); + auto result = xla::Broadcast(ctx->Input("value"), dims); ctx->SetOutput(0, result); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc index d069373086a..e310db2162d 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc @@ -48,9 +48,8 @@ class ArgMaxCustomCallOp : public XlaOpKernel { // We require that the dimension argument is a constant, since it lets us // dispatch to a specialized custom-call function without any run-time // overhead, when compiling ahead-of-time. - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &literal)); - const int32 dim = literal.Get({}); + int64 dim; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &dim)); OP_REQUIRES(ctx, dim >= 0, errors::InvalidArgument("dim must be >= 0")); OP_REQUIRES( ctx, dim < input_shape.dims(), diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc index 4833a9662dd..f6b8534f4d7 100644 --- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc @@ -41,10 +41,8 @@ class MirrorPadOp : public XlaOpKernel { for (int64 dimno = xla::ShapeUtil::Rank(original_shape) - 1; dimno >= 0; --dimno) { auto t_rev = xla::Rev(accum, {dimno}); - TF_ASSIGN_OR_RETURN(int64 lhs_padding, - pad_literal.GetIntegralAsS64({dimno, 0})); - TF_ASSIGN_OR_RETURN(int64 rhs_padding, - pad_literal.GetIntegralAsS64({dimno, 1})); + int64 lhs_padding = pad_literal.Get({dimno, 0}); + int64 rhs_padding = pad_literal.Get({dimno, 1}); int64 dim_size = original_shape.dimensions(dimno); // Padding amounts on each side must be no more than the size of the @@ -65,8 +63,8 @@ class MirrorPadOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - const TensorShape pad_shape = ctx->InputShape(1); + const TensorShape input_shape = ctx->InputShape("input"); + const TensorShape pad_shape = ctx->InputShape("paddings"); MirrorPadMode mode; OP_REQUIRES_OK(ctx, GetNodeAttr(def(), "mode", &mode)); @@ -81,23 +79,19 @@ class MirrorPadOp : public XlaOpKernel { TensorShapeUtils::IsMatrix(pad_shape) && pad_shape.dim_size(1) == 2, errors::InvalidArgument("paddings must be a matrix with 2 columns: ", pad_shape.DebugString())); - const int fixed_dims = - (allow_legacy_scalars() && dims == 0 && pad_shape.dim_size(0) == 1) - ? 
1 - : dims; OP_REQUIRES( - ctx, fixed_dims == pad_shape.dim_size(0), + ctx, dims == pad_shape.dim_size(0), errors::InvalidArgument( "The first dimension of paddings must be the rank of inputs", pad_shape.DebugString(), " ", input_shape.DebugString())); // Evaluate the 'padding' constant input, reshaping to a matrix. xla::Literal pad_literal; - OP_REQUIRES_OK( - ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal)); + OP_REQUIRES_OK(ctx, + ctx->ConstantInputAsInt64Literal("paddings", &pad_literal)); xla::XlaBuilder* b = ctx->builder(); - auto in0 = ctx->Input(0); + auto in0 = ctx->Input("input"); xla::StatusOr in0_shape = b->GetShape(in0); OP_REQUIRES(ctx, in0_shape.ok(), in0_shape.status()); xla::StatusOr accum_status = diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc index 3f5445b4821..36ea70ac392 100644 --- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { namespace { @@ -29,40 +30,36 @@ class PadOp : public XlaOpKernel { explicit PadOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - const TensorShape pad_shape = ctx->InputShape(1); + const TensorShape input_shape = ctx->InputShape("input"); + const TensorShape pad_shape = ctx->InputShape("paddings"); const int dims = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsMatrix(pad_shape) && pad_shape.dim_size(1) == 2, errors::InvalidArgument("paddings must be a matrix with 2 columns: ", pad_shape.DebugString())); - const int fixed_dims = - (allow_legacy_scalars() && dims == 0 && pad_shape.dim_size(0) == 1) - ? 1 - : dims; OP_REQUIRES( - ctx, fixed_dims == pad_shape.dim_size(0), + ctx, dims == pad_shape.dim_size(0), errors::InvalidArgument( "The first dimension of paddings must be the rank of inputs", pad_shape.DebugString(), " ", input_shape.DebugString())); - if (fixed_dims == 0) { + xla::XlaOp input = ctx->Input("input"); + if (dims == 0) { // Tensor is rank 0. Return it unchanged. - ctx->SetOutput(0, ctx->Input(0)); + ctx->SetOutput(0, input); return; } - // Evaluate the 'padding' constant input, reshaping to a matrix. xla::Literal pad_literal; - OP_REQUIRES_OK( - ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal)); + OP_REQUIRES_OK(ctx, + ctx->ConstantInputAsInt64Literal("paddings", &pad_literal)); xla::PaddingConfig config; - for (int i = 0; i < fixed_dims; ++i) { + for (int i = 0; i < dims; ++i) { auto* dim = config.add_dimensions(); - int before = pad_literal.Get({i, 0}); - int after = pad_literal.Get({i, 1}); + int before = pad_literal.Get({i, 0}); + int after = pad_literal.Get({i, 1}); OP_REQUIRES(ctx, before >= 0 && after >= 0, errors::InvalidArgument( "Paddings must be non-negative: ", before, " ", after)); @@ -73,12 +70,13 @@ class PadOp : public XlaOpKernel { // PadV2 added a "constant_values" input that indicates the pad value. 
xla::XlaOp constant_values; if (ctx->num_inputs() == 3) { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(2)), - errors::InvalidArgument("constant_values must be a scalar.")); - ctx->SetOutput(0, xla::Pad(ctx->Input(0), ctx->Input(2), config)); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(ctx->InputShape("constant_values")), + errors::InvalidArgument("constant_values must be a scalar.")); + ctx->SetOutput(0, xla::Pad(input, ctx->Input("constant_values"), config)); } else { auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0)); - ctx->SetOutput(0, xla::Pad(ctx->Input(0), zero, config)); + ctx->SetOutput(0, xla::Pad(input, zero, config)); } } }; diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index 47a4eac2066..fa1b6b91710 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { namespace { @@ -36,7 +37,7 @@ class ReshapeOp : public XlaOpKernel { const TensorShape input_shape = ctx->InputShape(0); const TensorShape sizes_shape = ctx->InputShape(1); // Preliminary validation of sizes. - OP_REQUIRES(ctx, IsLegacyVector(sizes_shape), + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(sizes_shape), errors::InvalidArgument("sizes input must be 1-D, not shape ", sizes_shape.DebugString())); const int64 num_dims = sizes_shape.num_elements(); diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc index e172c649325..6970dd0a006 100644 --- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -46,61 +47,8 @@ class RetvalOp : public XlaOpKernel { // compilation. 
OP_REQUIRES_OK(ctx, frame->SetRetval(index_, input)); } else { - xla::XlaOp input = ctx->Input(0); - const TensorShape input_shape = ctx->InputShape(0); - DataType input_type = ctx->input_type(0); - XlaContext& tc = XlaContext::Get(ctx); - - if (input_type == DT_RESOURCE) { - XlaResource* resource; - OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); - ctx->SetStatus(tc.AddResourceRetval(index_, resource)); - return; - } - - auto is_constant = ctx->builder()->IsConstant(input); - if (!is_constant.ok()) { - ctx->SetStatus(is_constant.status()); - return; - } - - if (tc.resolve_compile_time_constants() && - (input_shape.num_elements() == 0 || is_constant.ValueOrDie())) { - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal)); - OP_REQUIRES_OK(ctx, tc.AddConstRetval(index_, dtype_, literal)); - } else { - TensorShape shape = ctx->InputShape(0); - ctx->SetStatus(is_constant.status()); - TensorShape representation_shape; - if (tc.is_entry_computation()) { - xla::StatusOr shape_or_status = - tc.RepresentationShape(shape, ctx->input_type(0)); - if (!shape_or_status.ok()) { - ctx->SetStatus(shape_or_status.status()); - return; - } else { - representation_shape = shape_or_status.ValueOrDie(); - } - } else { - representation_shape = shape; - } - - xla::XlaOp output = input; - if (tc.is_entry_computation()) { - output = xla::Reshape(input, representation_shape.dim_sizes()); - } else { - // The core from which a return value is returned depends on the - // device assignment of the input to the retval. Since we can't change - // the device assignment of "input" at this point, we must always - // introduce an operator here, even if the shape does not change. - // TODO(b/76097077): propagate device assignments onto arguments and - // return values of functions, and then reshape unconditionally. - output = - xla::GetTupleElement(xla::Tuple(ctx->builder(), {output}), 0); - } - tc.AddRetval(index_, dtype_, shape, output); - } + XlaContext& xla_context = XlaContext::Get(ctx); + xla_context.SetRetval(index_, ctx->InputExpression(0)); } } diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc index 56b80cb4a29..2ceadaf79c5 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc @@ -51,14 +51,11 @@ class ReverseOp : public XlaOpKernel { } // XlaBuilder::Rev() requires concrete values for dimensions arg. xla::Literal lax; - OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {x_shape.dims()}, &lax)); - std::vector revdims(x_shape.dims()); - std::copy(lax.data().begin(), lax.data().end(), - revdims.begin()); - std::vector dimensions; + OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &lax)); + std::vector dimensions; for (int d = 0; d < x_shape.dims(); ++d) { - if (revdims[d]) { + if (lax.Get({d})) { dimensions.push_back(d); } } diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc index 379f4aeb0fc..60b011ba6d9 100644 --- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc @@ -30,31 +30,6 @@ limitations under the License. 
namespace tensorflow { namespace { -template -Status GetValue(int index, XlaOpKernelContext* ctx, T* value) { - xla::Literal literal; - TF_RETURN_IF_ERROR(ctx->ConstantInput(index, &literal)); - *value = literal.Get({}); - return Status::OK(); -} - -Status GetIntValue(int index, XlaOpKernelContext* ctx, int64* value) { - xla::Literal literal; - TF_RETURN_IF_ERROR(ctx->ConstantInput(index, &literal)); - switch (literal.shape().element_type()) { - case xla::S32: - *value = literal.Get({}); - break; - case xla::S64: - *value = literal.Get({}); - break; - default: - return errors::InvalidArgument("Invalid argument type for argument", - index); - } - return Status::OK(); -} - // The type-specific part of the implementation of Range. template xla::StatusOr CreateRangeTensor( @@ -98,13 +73,13 @@ class RangeOp : public XlaOpKernel { const TensorShape start_in_shape = ctx->InputShape(0); const TensorShape limit_in_shape = ctx->InputShape(1); const TensorShape delta_in_shape = ctx->InputShape(2); - OP_REQUIRES(ctx, IsLegacyScalar(start_in_shape), + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(start_in_shape), errors::InvalidArgument("start must be a scalar, not shape ", start_in_shape.DebugString())); - OP_REQUIRES(ctx, IsLegacyScalar(limit_in_shape), + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(limit_in_shape), errors::InvalidArgument("limit must be a scalar, not shape ", limit_in_shape.DebugString())); - OP_REQUIRES(ctx, IsLegacyScalar(delta_in_shape), + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(delta_in_shape), errors::InvalidArgument("delta must be a scalar, not shape ", delta_in_shape.DebugString())); xla::Literal start, limit, delta; @@ -147,9 +122,9 @@ class LinSpaceOp : public XlaOpKernel { explicit LinSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - const TensorShape start_in_shape = ctx->InputShape(0); - const TensorShape stop_in_shape = ctx->InputShape(1); - const TensorShape num_in_shape = ctx->InputShape(2); + const TensorShape start_in_shape = ctx->InputShape("start"); + const TensorShape stop_in_shape = ctx->InputShape("stop"); + const TensorShape num_in_shape = ctx->InputShape("num"); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(start_in_shape), errors::InvalidArgument("start must be a scalar, not shape ", start_in_shape.DebugString())); @@ -163,16 +138,20 @@ class LinSpaceOp : public XlaOpKernel { DataType type = ctx->input_type(0); int64 num; - OP_REQUIRES_OK(ctx, GetIntValue(2, ctx, &num)); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("num", &num)); OP_REQUIRES(ctx, num > 0, errors::InvalidArgument("Requires num > 0: ", num)); Tensor out_constant(type, TensorShape({num})); + xla::Literal start_literal; + OP_REQUIRES_OK(ctx, ctx->ConstantInput("start", &start_literal)); + xla::Literal stop_literal; + OP_REQUIRES_OK(ctx, ctx->ConstantInput("stop", &stop_literal)); + switch (type) { case DT_FLOAT: { - float start, stop; - OP_REQUIRES_OK(ctx, GetValue(0, ctx, &start)); - OP_REQUIRES_OK(ctx, GetValue(1, ctx, &stop)); + float start = start_literal.GetFirstElement(); + float stop = stop_literal.GetFirstElement(); auto flat = out_constant.flat(); if (num == 1) { flat(0) = start; @@ -185,9 +164,8 @@ class LinSpaceOp : public XlaOpKernel { break; } case DT_DOUBLE: { - double start, stop; - OP_REQUIRES_OK(ctx, GetValue(0, ctx, &start)); - OP_REQUIRES_OK(ctx, GetValue(1, ctx, &stop)); + double start = start_literal.GetFirstElement(); + double stop = stop_literal.GetFirstElement(); auto flat = out_constant.flat(); if (num == 1) 
{ flat(0) = start; diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 37b026aeb05..12830816ec1 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/bounds_check.h" namespace tensorflow { @@ -108,21 +109,16 @@ class ExpandDimsOp : public XlaOpKernel { explicit ExpandDimsOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - const TensorShape dim_shape = ctx->InputShape(1); + const TensorShape input_shape = ctx->InputShape("input"); + const TensorShape dim_shape = ctx->InputShape("dim"); - // TODO(phawkins): the standard implementation of ExpandDimsOp seems to - // accept legacy scalars, even when they should be forbidden by the graphdef - // version. - OP_REQUIRES(ctx, dim_shape.num_elements() == 1, + std::vector dims; + OP_REQUIRES_OK(ctx, ctx->ConstantInputReshapedToIntVector("dim", &dims)); + OP_REQUIRES(ctx, dims.size() == 1, errors::InvalidArgument(absl::StrCat( "dim input to ExpandDims must be a scalar; got ", dim_shape.DebugString()))); - - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {1}, &literal)); - - int dim = literal.data()[0]; + int dim = dims[0]; OP_REQUIRES(ctx, (dim >= -1 - input_shape.dims() && dim <= input_shape.dims()), @@ -148,7 +144,7 @@ class ExpandDimsOp : public XlaOpKernel { dim = std::min(dim, existing_dims_size); new_shape.emplace(new_shape.begin() + dim, 1); - ctx->SetOutput(0, xla::Reshape(ctx->Input(0), new_shape)); + ctx->SetOutput(0, xla::Reshape(ctx->Input("input"), new_shape)); } }; REGISTER_XLA_OP(Name("ExpandDims").CompileTimeConstantInput("dim"), diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 34980ead818..88da64e5a21 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/mem.h" @@ -42,8 +43,8 @@ class SliceOp : public XlaOpKernel { OP_REQUIRES( ctx, - IsLegacyVector(begin_tensor_shape) && - IsLegacyVector(size_tensor_shape) && + TensorShapeUtils::IsVector(begin_tensor_shape) && + TensorShapeUtils::IsVector(size_tensor_shape) && begin_tensor_shape.num_elements() == input_shape.dims() && size_tensor_shape.num_elements() == input_shape.dims(), errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc index 230a343f796..7a0e240400b 100644 --- a/tensorflow/compiler/tf2xla/kernels/split_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc @@ -35,26 +35,16 @@ class SplitOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { const int32 num_split = num_outputs(); - const TensorShape index_shape = ctx->InputShape(0); + const TensorShape split_dim_shape = ctx->InputShape("split_dim"); const TensorShape input_shape = ctx->InputShape(1); - xla::Literal literal_index; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal_index)); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(split_dim_shape), + errors::InvalidArgument("split_dim must be a scalar but has rank ", + split_dim_shape.dims())); + int64 split_dim_orig; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &split_dim_orig)); - int32 split_dim_orig; - if (index_shape.dims() == 0) { - split_dim_orig = literal_index.Get({}); - } else { - OP_REQUIRES( - ctx, index_shape.dims() == 1, - errors::InvalidArgument("split_index input to Split Op must be a " - "scalar or a vector with 1 element")); - OP_REQUIRES( - ctx, index_shape.dim_size(0) == 1, - errors::InvalidArgument("split_index input to Split Op must be a " - "scalar or a vector with 1 element")); - split_dim_orig = literal_index.Get({0}); - } int32 split_dim = split_dim_orig < 0 ? split_dim_orig + input_shape.dims() : split_dim_orig; OP_REQUIRES(ctx, 0 <= split_dim && split_dim < input_shape.dims(), @@ -138,7 +128,6 @@ class SplitVOp : public XlaOpKernel { // Check that sizes are correct. int total_split_size = 0; int neg_one_dim = -1; - std::vector split_sizes_vec(num_split, -1); const TensorShape split_size_shape = ctx->InputShape(1); OP_REQUIRES(ctx, split_size_shape.dims() == 1 && @@ -150,12 +139,11 @@ class SplitVOp : public XlaOpKernel { split_size_shape.dims(), "-D and ", split_size_shape.num_elements(), " elements")); // Get the dimension of this split. 
- xla::Literal split_size_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &split_size_literal)); + std::vector split_sizes; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &split_sizes)); for (int i = 0; i < num_split; ++i) { - int slice_size; - slice_size = split_size_literal.Get({i}); + int64 slice_size = split_sizes[i]; if (slice_size == -1) { OP_REQUIRES( ctx, neg_one_dim == -1, @@ -164,7 +152,6 @@ class SplitVOp : public XlaOpKernel { i)); neg_one_dim = i; } else { - split_sizes_vec[i] = slice_size; total_split_size += slice_size; } } @@ -183,7 +170,7 @@ class SplitVOp : public XlaOpKernel { total_split_size)); if (neg_one_dim >= 0) { - split_sizes_vec[neg_one_dim] = + split_sizes[neg_one_dim] = input_shape.dim_size(split_dim) - total_split_size; } @@ -195,7 +182,7 @@ class SplitVOp : public XlaOpKernel { std::vector strides(input_shape.dims(), 1); for (int i = 0; i < num_split; ++i) { TensorShape output_shape(input_shape); - int slice_size = split_sizes_vec[i]; + int slice_size = split_sizes[i]; output_shape.set_dim(split_dim, slice_size); // Slice out the ith split from the split dimension. diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index d79cdad9fa2..7b96b43ad83 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -126,7 +126,9 @@ class StackOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(StackOp); }; -REGISTER_XLA_OP(Name("StackV2").CompileTimeConstantInput("max_size"), StackOp); +REGISTER_XLA_OP( + Name("StackV2").CompileTimeConstantInput("max_size").CompilationOnly(), + StackOp); class StackPushOp : public XlaOpKernel { public: @@ -173,7 +175,7 @@ class StackPushOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(StackPushOp); }; -REGISTER_XLA_OP(Name("StackPushV2"), StackPushOp); +REGISTER_XLA_OP(Name("StackPushV2").CompilationOnly(), StackPushOp); class StackPopOp : public XlaOpKernel { public: @@ -227,7 +229,7 @@ class StackPopOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(StackPopOp); }; -REGISTER_XLA_OP(Name("StackPopV2"), StackPopOp); +REGISTER_XLA_OP(Name("StackPopV2").CompilationOnly(), StackPopOp); class StackCloseOp : public XlaOpKernel { public: @@ -241,7 +243,7 @@ class StackCloseOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(StackCloseOp); }; -REGISTER_XLA_OP(Name("StackCloseV2"), StackCloseOp); +REGISTER_XLA_OP(Name("StackCloseV2").CompilationOnly(), StackCloseOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc index 7b2cd5a5b08..e1c764f3d5c 100644 --- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/type_index.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/macros.h" @@ -44,7 +45,7 @@ class TileOp : public XlaOpKernel { const TensorShape multiples_shape = ctx->InputShape("multiples"); OP_REQUIRES( - ctx, IsLegacyVector(multiples_shape), + ctx, TensorShapeUtils::IsVector(multiples_shape), errors::InvalidArgument("Expected multiples to be 1-D, but got shape ", multiples_shape.DebugString())); OP_REQUIRES(ctx, input_shape.dims() == multiples_shape.num_elements(), diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc index 48a211942d7..c9b324a243e 100644 --- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc @@ -37,8 +37,8 @@ class TransposeOp : public XlaOpKernel { : XlaOpKernel(ctx), conjugate_(conjugate) {} void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - const TensorShape perm_tensor_shape = ctx->InputShape(1); + const TensorShape input_shape = ctx->InputShape("x"); + const TensorShape perm_tensor_shape = ctx->InputShape("perm"); // Preliminary validation of sizes. OP_REQUIRES(ctx, TensorShapeUtils::IsVector(perm_tensor_shape), @@ -52,19 +52,15 @@ class TransposeOp : public XlaOpKernel { ". But input(1) is a vector of size ", perm_tensor_shape.num_elements())); - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {dims}, &literal)); - - std::vector perm(dims); - std::copy(literal.data().begin(), literal.data().end(), - perm.begin()); + std::vector perm; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("perm", &perm)); std::vector transposed_order; // Check whether permutation is a permutation of integers of [0 .. dims). absl::InlinedVector bits(dims); bool is_identity = true; for (int i = 0; i < dims; ++i) { - const int32 d = perm[i]; + const int64 d = perm[i]; OP_REQUIRES( ctx, 0 <= d && d < dims, errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")")); @@ -83,9 +79,9 @@ class TransposeOp : public XlaOpKernel { xla::XlaOp transposed; // 0-D, 1-D, and identity transposes do nothing. if (dims <= 1 || is_identity) { - transposed = ctx->Input(0); + transposed = ctx->Input("x"); } else { - transposed = xla::Transpose(ctx->Input(0), transposed_order); + transposed = xla::Transpose(ctx->Input("x"), transposed_order); } // Conjugate the transposed result if this is ConjugateTransposeOp. diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index 0bdfc057261..a0ea6422d73 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -80,24 +80,8 @@ XLAJIT_MAKE_UNARY(Invert, xla::Not(x)); XLAJIT_MAKE_UNARY(LogicalNot, xla::Not(x)); XLAJIT_MAKE_UNARY(Neg, -x); -// Implements Banker's rounding: numbers that are equidistant between two -// integers are rounded towards even. 
-xla::XlaOp RoundToEven(xla::XlaOp x) { - auto half = xla::ScalarLike(x, 0.5); - auto one = xla::ScalarLike(x, 1.0); - auto two = xla::ScalarLike(x, 2.0); - - auto round_val = xla::Floor(x); - auto fraction = x - round_val; - auto nearest_even_int = round_val - two * xla::Floor(half * x); - auto is_odd = xla::Eq(nearest_even_int, one); - return xla::Select(xla::Or(xla::Gt(fraction, half), - xla::And(xla::Eq(fraction, half), is_odd)), - round_val + one, round_val); -} - -XLAJIT_MAKE_UNARY(Rint, RoundToEven(x)); -XLAJIT_MAKE_UNARY(Round, RoundToEven(x)); +XLAJIT_MAKE_UNARY(Rint, xla::RoundToEven(x)); +XLAJIT_MAKE_UNARY(Round, xla::RoundToEven(x)); XLAJIT_MAKE_UNARY(Rsqrt, xla::Rsqrt(x)); diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index 20103ec3ae0..67d08290033 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -32,6 +32,12 @@ Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, return Status::OK(); } +xla::StatusOr HostTensorToLiteral(const Tensor& host_tensor) { + xla::BorrowingLiteral literal; + TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(host_tensor, &literal)); + return literal.Clone(); +} + Status HostTensorToMutableBorrowingLiteral( Tensor* host_tensor, xla::MutableBorrowingLiteral* literal) { xla::Shape xla_shape; diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h index 1db7470ee2a..a153dddee61 100644 --- a/tensorflow/compiler/tf2xla/literal_util.h +++ b/tensorflow/compiler/tf2xla/literal_util.h @@ -30,6 +30,11 @@ namespace tensorflow { // 'host_tensor'. Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, xla::BorrowingLiteral* literal); + +// Returns a Literal with the contents of 'host_tensor', backed by its own +// storage (i.e., not reusing 'host_tensor's buffers.) +xla::StatusOr HostTensorToLiteral(const Tensor& host_tensor); + // Returns a MutableBorrowingLiteral that utilizes the same underlying buffer // owned by 'host_tensor', but is mutable via the xla::Literal methods. 
Status HostTensorToMutableBorrowingLiteral( diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD index 8b559c87506..c9f486edc8d 100644 --- a/tensorflow/compiler/tf2xla/python/BUILD +++ b/tensorflow/compiler/tf2xla/python/BUILD @@ -3,6 +3,7 @@ licenses(["notice"]) # Apache 2.0 package( default_visibility = [ "//learning/deepmind/public/wavenet/python:__subpackages__", + "//learning/deepmind/research/alphastar:__subpackages__", "//learning/tfx:__subpackages__", "//tensorflow:internal", ], diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index cb7843850c3..ddb284966ee 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -124,13 +124,4 @@ Status XlaCompilationDevice::MakeTensorFromProto( "XLACompilationDevice::MakeTensorFromProto should not be called"); } -XlaExpression::XlaExpression() = default; - -void XlaExpression::set_handle(const xla::XlaOp& h) { handle_ = h; } - -void XlaExpression::set_constant_value(Tensor value) { - has_constant_value_ = true; - constant_value_ = std::move(value); -} - } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h index a6e78825334..de6a3356e05 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.h +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h @@ -18,9 +18,6 @@ limitations under the License. #include -#include "tensorflow/compiler/tf2xla/xla_resource.h" -#include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/tensor.h" @@ -38,8 +35,8 @@ class XlaCompilationAllocator; // This is a 'dummy' TensorFlow device that is only used to execute a // subgraph of XLA compilation Ops to construct a compiled version // of the subgraph's computation. It has a 'dummy' allocator that -// backs each Tensor with metadata indicating the computation the -// Tensor represents. +// backs each Tensor with an XlaExpression. The shape of the Tensor +// matches the shape of XlaExpression. // // We deliberately don't register a device factory because we *never* // want placement to put Ops on a compilation device. The device is created @@ -67,40 +64,6 @@ class XlaCompilationDevice : public LocalDevice { std::unique_ptr allocator_; }; -// A XlaExpression wraps an XLA computation. Each Tensor on an -// XlaCompilationDevice contains an XlaExpression, and the shape of the Tensor -// matches the shape of the subcomputation in the XlaOp. Each -// expression is either a constant, or a function of previously-compiled -// expressions. -class XlaExpression { - public: - XlaExpression(); - - // handle() stores the XLA handle of the computation that the - // expression represents. - void set_handle(const xla::XlaOp& h); - const xla::XlaOp& handle() const { return handle_; } - - void set_constant_value(Tensor value); - bool has_constant_value() const { return has_constant_value_; } - const Tensor& constant_value() const { return constant_value_; } - - void set_resource(XlaResource* resource) { resource_ = resource; } - XlaResource* resource() const { return resource_; } - - private: - // The XLA handle of the expression's computation. 
- xla::XlaOp handle_; - - // If this expression is a constant with a known value, 'constant_value' is a - // host-memory Tensor containing the value. Used to avoid invoking XLA for - // expressions that are trivially constant. - bool has_constant_value_ = false; - Tensor constant_value_; - - XlaResource* resource_ = nullptr; // Not owned. -}; - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_ diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index e177a5f07f5..a08d030ce71 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -36,10 +36,13 @@ limitations under the License. #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" @@ -48,7 +51,7 @@ namespace { // Checks that arguments `args` match types `types`. Status CheckSignature(const DataTypeVector& types, - const std::vector& args) { + absl::Span args) { if (args.size() != types.size()) { return errors::Internal("Compilation arguments have ", args.size(), " elements while function has ", types.size()); @@ -63,6 +66,240 @@ Status CheckSignature(const DataTypeVector& types, return Status::OK(); } +// Uses the _Arg and _Retval nodes in the graph to determine a core assignment +// for each argument and return value. +xla::StatusOr, std::map>> +ComputeArgAndRetvalCores(const Graph& graph) { + auto get_sharding_for_node = [](const Node* n) -> xla::StatusOr { + TF_ASSIGN_OR_RETURN( + auto sharding, + ParseShardingFromDevice(*n, std::numeric_limits::max())); + if (sharding.has_value()) { + TF_RET_CHECK(sharding.value().type() == + xla::OpSharding::Type::OpSharding_Type_MAXIMAL); + return sharding.value().tile_assignment_devices(0); + } else { + return -1; + } + }; + std::map arg_cores; + std::map retval_cores; + for (const Node* n : graph.nodes()) { + if (n->type_string() == FunctionLibraryDefinition::kArgOp) { + TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n)); + if (core < 0) continue; + int index; + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); + TF_RET_CHECK(index >= 0) << "Negative _Arg index"; + arg_cores[index] = core; + } else if (n->type_string() == FunctionLibraryDefinition::kRetOp) { + TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n)); + if (core < 0) continue; + int index; + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); + TF_RET_CHECK(index >= 0) << "Negative _Retval index"; + TF_ASSIGN_OR_RETURN(retval_cores[index], get_sharding_for_node(n)); + retval_cores[index] = core; + } + } + return std::make_pair(std::move(arg_cores), std::move(retval_cores)); +} + +Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, + XlaCompilationDevice* device, FunctionLibraryRuntime* flib, + int64 step_id) { + // Resource cleanup is a bit messy. XlaContext is a ref-countd resource; the + // resource manager takes ownership via Create, and unrefs via Cleanup. 
We + // explicitly add a reference to ensure the refcount at entry is maintained at + // all exit points; Create and Cleanup are always called in this function. + // + // The Executor requires us to use ScopedStepContainer. We wrap it in a + // unique_ptr so we can capture the cleanup status in the end. + xla_context->Ref(); + Status status; + auto step_container = absl::make_unique( + step_id, [&status, device](const string& name) { + status = device->resource_manager()->Cleanup(name); + }); + TF_RETURN_IF_ERROR(device->resource_manager()->Create( + step_container->name(), XlaContext::kXlaContextResourceName, + xla_context)); + + GraphCompiler graph_compiler(device, graph.get(), flib, step_container.get()); + TF_RETURN_IF_ERROR(graph_compiler.Compile()); + // Explicitly clean up the step container, to capture the cleanup status. + step_container.reset(); + return Status::OK(); +} + +// Builds the XLA computation. +// - `args` is the list of input arguments +// - `retvals` is the list of retvals produced by _Retval operators, in index +// order. +// - `args_core` and `retval_cores` are mapping from arg/return indices to core +// assignments. +// - If `return_updated_values_for_all_resources` is true, all resources will be +// included in `resource_updates`, regardless of whether their value changed. +// - Sets `*num_nonconst_outputs` to the number of outputs of the `computation`. +// - Sets `*resource_updates` to a description of resources whose values are +// written by the computation; the variable writes are the last +// - `resource_updates.size()` return values from the computation. Each entry in +// `resource_updates` is a ResourceUpdate, whose `index` is the index of a +// resource variable argument to the computation to be updated, and `type` is +// the type of the final output. +Status BuildComputation( + const std::vector& args, + const std::vector& retvals, + const std::map& arg_cores, const std::map& retval_cores, + const std::vector>& resources, + std::unique_ptr token_output, + const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, + bool return_updated_values_for_all_resources, bool always_return_tuple, + xla::XlaBuilder* builder, xla::XlaComputation* computation, + int* num_computation_outputs, int* num_nonconst_outputs, + std::vector* outputs, + std::vector* resource_updates) { + // Attach a common operator name as metadata. This has no semantic effect — it + // merely makes the HLO graph more readable when visualized via TensorBoard, + // since TensorBoard forms groups out of operators with similar names. + xla::OpMetadata retval_metadata; + retval_metadata.set_op_name("XLA_Retvals"); + builder->SetOpMetadata(retval_metadata); + auto cleanup = gtl::MakeCleanup([builder]() { builder->ClearOpMetadata(); }); + + // Builds a no-op XLA computation. We need to set the sharding of outputs, but + // cannot change the sharding of the existing output op. To do this, we build + // a new identity op to which shardings can be applied. 
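// Editor's illustrative note: the identity_op lambda defined below relies on
// GetTupleElement(Tuple(builder, {op}), 0) producing a fresh instruction that
// computes the same value as `op`. Building that fresh instruction inside an
// xla::XlaScopedShardingAssignment scope attaches the desired sharding to it
// without modifying the sharding already carried by `op` itself, e.g.:
//   xla::XlaScopedShardingAssignment assign(
//       builder, xla::sharding_builder::AssignDevice(/*core=*/0));
//   xla::XlaOp pinned = identity_op(value);  // `pinned` carries the sharding.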
+ auto identity_op = [builder](xla::XlaOp op) { + return xla::GetTupleElement(xla::Tuple(builder, {op}), 0); + }; + + std::vector elems; + elems.reserve(retvals.size()); + for (int i = 0; i < retvals.size(); ++i) { + XlaCompiler::OutputDescription& output = (*outputs)[i]; + const XlaExpression& retval = retvals[i]; + output.type = retval.dtype(); + switch (retval.kind()) { + case XlaExpression::Kind::kConstant: + output.is_constant = true; + output.constant_value = retval.constant_value(); + output.shape = output.constant_value.shape(); + break; + + case XlaExpression::Kind::kXlaOp: { + output.is_constant = false; + TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape()); + xla::XlaOp value = retval.handle(); + auto it = retval_cores.find(i); + xla::XlaScopedShardingAssignment assign_sharding( + builder, it == retval_cores.end() + ? absl::optional() + : xla::sharding_builder::AssignDevice(it->second)); + if (shape_representation_fn) { + // If there is a shape representation function, reshape the output + // tensor to the shape given by the representation shape function. + TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn( + output.shape, output.type)); + value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions())); + } else if (it != retval_cores.end()) { + // Apply the sharding to the output, if there is a core assignment. + value = identity_op(value); + } + elems.push_back(value); + break; + } + + case XlaExpression::Kind::kResource: + output.is_constant = false; + output.input_index = retval.resource()->arg_num(); + output.shape = retval.resource()->shape(); + break; + + case XlaExpression::Kind::kInvalid: + return errors::InvalidArgument( + "Invalid expression returned by computation. " + "This probably means a return value was not set."); + } + } + *num_nonconst_outputs = elems.size(); + + // Add return values for resources whose values have changed. + std::vector arg_resources; + arg_resources.reserve(resources.size()); + for (const auto& resource : resources) { + if (resource->arg_num() >= 0) { + arg_resources.push_back(resource.get()); + } + } + std::sort(arg_resources.begin(), arg_resources.end(), + [](const XlaResource* a, const XlaResource* b) { + return a->arg_num() < b->arg_num(); + }); + + for (const XlaResource* resource : arg_resources) { + DCHECK_LT(resource->arg_num(), args.size()); + const XlaCompiler::Argument& arg = args[resource->arg_num()]; + auto it = arg_cores.find(resource->arg_num()); + const int core = it == arg_cores.end() ? -1 : it->second; + bool modified = !resource->value().IsIdenticalTo(resource->initial_value()); + // TensorArray gradients were modified if their values changed or there are + // any newly created gradients. + for (const auto& grad : resource->tensor_array_gradients()) { + modified = + modified || + !grad.second->value().IsIdenticalTo(grad.second->initial_value()) || + arg.tensor_array_gradients.count(grad.first) == 0; + } + if (return_updated_values_for_all_resources || modified) { + resource_updates->emplace_back(); + XlaCompiler::ResourceUpdate& update = resource_updates->back(); + update.input_index = resource->arg_num(); + update.type = resource->type(); + update.shape = resource->shape(); + update.modified = modified; + for (const auto& grad : resource->tensor_array_gradients()) { + update.tensor_array_gradients_accessed.insert(grad.first); + } + + // Request that the value be returned on a specific core. + xla::XlaScopedShardingAssignment assign_sharding( + builder, core == -1 ? 
absl::optional() + : xla::sharding_builder::AssignDevice(core)); + + xla::XlaOp handle; + TF_RETURN_IF_ERROR(resource->Pack(&handle, builder)); + + // Ensures the correct sharding is applied to the output. + handle = identity_op(handle); + + elems.push_back(handle); + } + } + + // If we have token output, append it as the last one. + if (token_output) { + elems.push_back(*token_output); + } + + *num_computation_outputs = elems.size(); + + // Builds the XLA computation. We *always* form a tuple here to ensure that + // the output value is the last thing added into the XLA computation, even + // if there is only one output value. + auto tuple = xla::Tuple(builder, elems); + if (!always_return_tuple && elems.size() == 1) { + xla::GetTupleElement(tuple, 0); + } + + xla::StatusOr computation_status = builder->Build(); + if (!computation_status.ok()) { + return computation_status.status(); + } + *computation = computation_status.ConsumeValueOrDie(); + return Status::OK(); +} + } // namespace bool XlaCompiler::Argument::operator==( @@ -83,6 +320,39 @@ bool XlaCompiler::Argument::operator==( return constant_value.tensor_data() == other.constant_value.tensor_data(); } +string XlaCompiler::Argument::HumanString() const { + string common; + if (!name.empty()) { + common = absl::StrCat(" name=", name); + } + absl::StrAppend(&common, " type=", DataTypeString(type), + " shape=", shape.DebugString()); + switch (kind) { + case kInvalid: + return "invalid"; + case kConstant: + return absl::StrCat("kind=constant", common, + " value=", constant_value.DebugString()); + case kResource: { + string output = absl::StrCat("kind=resource", common, " resource_kind=", + XlaResource::KindToString(resource_kind), + " initialized=", initialized); + if (tensor_array_size >= 0) { + absl::StrAppend(&output, " tensor_array_size=", tensor_array_size); + } + if (!tensor_array_gradients.empty()) { + absl::StrAppend(&output, " tensor_array_gradients=", + absl::StrJoin(tensor_array_gradients, ",")); + } + return output; + } + case kParameter: + return absl::StrCat("kind=parameter", common); + case kToken: + return absl::StrCat("token", common); + } +} + XlaCompiler::XlaCompiler(XlaCompiler::Options options) : options_(options), initialization_status_(Status::OK()), @@ -110,8 +380,13 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options) // The default shape representation function is the identity. 
if (!options_.shape_representation_fn) { - options_.shape_representation_fn = [](const TensorShape& shape, - DataType type) { return shape; }; + options_.shape_representation_fn = + [](const TensorShape& shape, + DataType dtype) -> xla::StatusOr { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape)); + return xla_shape; + }; } } @@ -171,15 +446,16 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { return graph; } -Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, - const NameAttrList& function, - std::vector args, - XlaCompiler::CompilationResult* result) { +Status XlaCompiler::CompileFunction( + const XlaCompiler::CompileOptions& options, const NameAttrList& function, + absl::Span args, + XlaCompiler::CompilationResult* result) { const string function_id = Canonicalize(function.name(), AttrSlice(&function.attr())); VLOG(1) << "XlaCompiler::CompileFunction " << function_id; - auto it = cache_.find({function_id, args}); + const std::vector arg_vector(args.begin(), args.end()); + auto it = cache_.find({function_id, arg_vector}); if (it != cache_.end()) { *result = it->second; return Status::OK(); @@ -212,14 +488,16 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, // lowest-numbered core that consumes the argument. We choose the // lowest-numbered core so the assignment is deterministic. for (Node* n : graph->nodes()) { - if (absl::string_view(n->type_string()) == "_Arg") { + if (absl::string_view(n->type_string()) == + FunctionLibraryDefinition::kArgOp) { TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/true)); } } // Do _Retval as a second loop, in case the retval's input is an _Arg (which // may have gotten a device assignment from the first loop). 
for (Node* n : graph->nodes()) { - if (absl::string_view(n->type_string()) == "_Retval") { + if (absl::string_view(n->type_string()) == + FunctionLibraryDefinition::kRetOp) { TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/false)); } } @@ -235,7 +513,7 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, CompileGraph(options, function_id, std::move(graph), args, result)); VLOG(1) << "===================================================="; - cache_[{function_id, args}] = *result; + cache_[{function_id, arg_vector}] = *result; return Status::OK(); } @@ -247,25 +525,24 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, case XlaCompiler::Argument::kConstant: LOG(FATAL) << "Unreachable case"; case XlaCompiler::Argument::kParameter: { - TensorShape shape; if (is_entry_computation) { TF_ASSIGN_OR_RETURN( - shape, options_.shape_representation_fn(arg.shape, arg.type)); + *xla_shape, options_.shape_representation_fn(arg.shape, arg.type)); } else { - shape = arg.shape; + TF_RETURN_IF_ERROR( + TensorShapeToXLAShape(arg.type, arg.shape, xla_shape)); } - return TensorShapeToXLAShape(arg.type, shape, xla_shape); + return Status::OK(); } case XlaCompiler::Argument::kResource: { TF_RET_CHECK(arg.initialized); switch (arg.resource_kind) { case XlaResource::kVariable: { - TF_ASSIGN_OR_RETURN( - TensorShape representation_shape, - options_.shape_representation_fn(arg.shape, arg.type)); - return TensorShapeToXLAShape(arg.type, representation_shape, - xla_shape); + TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn( + arg.shape, arg.type)); + + return Status::OK(); } case XlaResource::kTensorArray: { if (arg.tensor_array_size < 0) { @@ -314,175 +591,16 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, } } -namespace { - -Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, - XlaCompilationDevice* device, FunctionLibraryRuntime* flib, - int64 step_id) { - // Resource cleanup is a bit messy. XlaContext is a ref-countd resource; the - // resource manager takes ownership via Create, and unrefs via Cleanup. We - // explicitly add a reference to ensure the refcount at entry is maintained at - // all exit points; Create and Cleanup are always called in this function. - // - // The Executor requires us to use ScopedStepContainer. We wrap it in a - // unique_ptr so we can capture the cleanup status in the end. - xla_context->Ref(); - Status status; - auto step_container = absl::make_unique( - step_id, [&status, device](const string& name) { - status = device->resource_manager()->Cleanup(name); - }); - TF_RETURN_IF_ERROR(device->resource_manager()->Create( - step_container->name(), XlaContext::kXlaContextResourceName, - xla_context)); - - GraphCompiler graph_compiler(device, graph.get(), flib, step_container.get()); - TF_RETURN_IF_ERROR(graph_compiler.Compile()); - // Explicitly clean up the step container, to capture the cleanup status. - step_container.reset(); - return Status::OK(); -} - -// Builds the XLA computation. -// `args` is the list of input arguments, `retvals` is the list of retvals -// produced by _Retval operators, in index order. -// If `return_updated_values_for_all_resources` is true, all resources will be -// included in `resource_updates`, regardless of whether their value changed. -// Sets `*num_nonconst_outputs` to the number of outputs of the `computation`. 
-// Sets `*resource_updates` to a description of resources whose values are -// written by the computation; the variable writes are the last -// `resource_updates.size()` return values from the computation. Each entry in -// `resource_updates` is a (input_index, type) pair, where `input_index` is the -// index of a resource variable argument to the computation, and `type` is the -// type of the final output. -Status BuildComputation( - const std::vector& args, - const std::vector& arg_cores, - const std::vector& retvals, - const std::vector>& resources, - std::unique_ptr token_output, - bool return_updated_values_for_all_resources, bool always_return_tuple, - xla::XlaBuilder* builder, xla::XlaComputation* computation, - int* num_computation_outputs, int* num_nonconst_outputs, - std::vector* outputs, - std::vector* resource_updates) { - std::vector elems; - elems.reserve(retvals.size()); - for (int i = 0; i < retvals.size(); ++i) { - XlaCompiler::OutputDescription& output = (*outputs)[i]; - output.type = retvals[i].type; - output.shape = retvals[i].shape; - const XlaExpression& retval = retvals[i].expression; - if (retval.has_constant_value()) { - output.is_constant = true; - output.constant_value = retval.constant_value(); - } else if (retval.resource() != nullptr) { - output.is_constant = false; - output.input_index = retval.resource()->arg_num(); - } else { - output.is_constant = false; - elems.push_back(retval.handle()); - } - } - *num_nonconst_outputs = elems.size(); - - // Add return values for resources whose values have changed. - std::vector arg_resources; - arg_resources.reserve(resources.size()); - for (const auto& resource : resources) { - if (resource->arg_num() >= 0) { - arg_resources.push_back(resource.get()); - } - } - std::sort(arg_resources.begin(), arg_resources.end(), - [](const XlaResource* a, const XlaResource* b) { - return a->arg_num() < b->arg_num(); - }); - - // Attach a common operator name as metadata. This has no semantic effect — it - // merely makes the HLO graph more readable when visualized via TensorBoard, - // since TensorBoard forms groups out of operators with similar names. - xla::OpMetadata retval_metadata; - retval_metadata.set_op_name("XLA_Retvals"); - builder->SetOpMetadata(retval_metadata); - - for (const XlaResource* resource : arg_resources) { - const XlaCompiler::Argument& arg = args[resource->arg_num()]; - const int core = arg_cores[resource->arg_num()]; - DCHECK_LT(resource->arg_num(), arg_cores.size()); - bool modified = !resource->value().IsIdenticalTo(resource->initial_value()); - // TensorArray gradients were modified if their values changed or there are - // any newly created gradients. - for (const auto& grad : resource->tensor_array_gradients()) { - modified = - modified || - !grad.second->value().IsIdenticalTo(grad.second->initial_value()) || - arg.tensor_array_gradients.count(grad.first) == 0; - } - if (return_updated_values_for_all_resources || modified) { - resource_updates->emplace_back(); - XlaCompiler::ResourceUpdate& update = resource_updates->back(); - update.input_index = resource->arg_num(); - update.type = resource->type(); - update.shape = resource->shape(); - update.modified = modified; - for (const auto& grad : resource->tensor_array_gradients()) { - update.tensor_array_gradients_accessed.insert(grad.first); - } - - // Request that the value be returned on a specific core. - xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? 
absl::optional() - : xla::sharding_builder::AssignDevice(core)); - - xla::XlaOp handle; - TF_RETURN_IF_ERROR(resource->Pack(&handle, builder)); - - // Since we can't change the sharding metadata of as this point, - // create a tuple/get-tuple-element combination so that sharding - // assignment will be placed on this value, which will cause the resource - // update to be returned from the same device that provided the resource. - handle = xla::GetTupleElement(xla::Tuple(builder, {handle}), 0); - elems.push_back(handle); - } - } - - // If we have token output, append it as the last one. - if (token_output) { - elems.push_back(*token_output); - } - - *num_computation_outputs = elems.size(); - - // Builds the XLA computation. We *always* form a tuple here to ensure that - // the output value is the last thing added into the XLA computation, even - // if there is only one output value. - auto tuple = xla::Tuple(builder, elems); - if (!always_return_tuple && elems.size() == 1) { - xla::GetTupleElement(tuple, 0); - } - builder->ClearOpMetadata(); - - xla::StatusOr computation_status = builder->Build(); - if (!computation_status.ok()) { - return computation_status.status(); - } - *computation = computation_status.ConsumeValueOrDie(); - return Status::OK(); -} - -} // namespace - // Builds XLA computations for each of the arguments to the computation. // `args` are the arguments to the computation. Status XlaCompiler::BuildArguments( const Graph& graph, const std::vector& args, bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context, - std::vector* arg_cores, std::vector* arg_expressions, + const std::map& arg_cores, + std::vector* arg_expressions, std::vector* input_mapping, std::vector* input_shapes, bool is_entry_computation) { arg_expressions->resize(args.size()); - *arg_cores = std::vector(args.size(), -1); // Argument numbers of arguments and resources that are to be passed to the // XLA computation as runtime parameters. @@ -504,7 +622,7 @@ Status XlaCompiler::BuildArguments( arg.resource_kind, i, arg.name, arg.type, arg.shape, xla::XlaOp(), /*tensor_array_size=*/arg.tensor_array_size, /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource)); - arg_expression.set_resource(resource); + arg_expression = XlaExpression::Resource(resource); if (arg.initialized) { input_mapping->push_back(i); } @@ -516,7 +634,7 @@ Status XlaCompiler::BuildArguments( break; } case XlaCompiler::Argument::kConstant: - arg_expression.set_constant_value(arg.constant_value); + arg_expression = XlaExpression::Constant(arg.constant_value); break; case XlaCompiler::Argument::kInvalid: return errors::Internal( @@ -541,26 +659,6 @@ Status XlaCompiler::BuildArguments( *input_shapes = arg_shapes; } - // Use the _Arg nodes in the graph to resolve core assignments. - for (const Node* n : graph.nodes()) { - if (absl::string_view(n->type_string()) != "_Arg") continue; - int index; - TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); - TF_RET_CHECK(index >= 0 && index < args.size()) - << "_Arg out of bounds: " << index << " vs " << args.size(); - TF_ASSIGN_OR_RETURN( - auto sharding, - ParseShardingFromDevice(*n, std::numeric_limits::max())); - if (sharding.has_value()) { - TF_RET_CHECK(sharding.value().type() == - xla::OpSharding::Type::OpSharding_Type_MAXIMAL); - const int core = sharding.value().tile_assignment_devices(0); - if ((*arg_cores)[index] == -1 || core < (*arg_cores)[index]) { - (*arg_cores)[index] = core; - } - } - } - // Attach a common operator name as metadata. 
This has no semantic effect — it // merely makes the HLO graph more readable when visualized via TensorBoard, // since TensorBoard forms groups out of operators with similar names. @@ -576,11 +674,10 @@ Status XlaCompiler::BuildArguments( xla::OpSharding tuple_sharding; tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE); for (int64 parameter : *input_mapping) { - const int core = (*arg_cores)[parameter]; - const int root_device = 0; + auto it = arg_cores.find(parameter); + const int core = it == arg_cores.end() ? 0 : it->second; *tuple_sharding.add_tuple_shardings() = - core == -1 ? xla::sharding_builder::AssignDevice(root_device) - : xla::sharding_builder::AssignDevice(core); + xla::sharding_builder::AssignDevice(core); } xla::XlaScopedShardingAssignment assign_tuple_sharding(builder, tuple_sharding); @@ -589,7 +686,8 @@ Status XlaCompiler::BuildArguments( tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple"); } for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { - const int core = (*arg_cores)[input_mapping->at(i)]; + auto it = arg_cores.find(i); + const int core = it == arg_cores.end() ? -1 : it->second; xla::XlaScopedShardingAssignment assign_sharding( builder, core == -1 ? absl::optional() : xla::sharding_builder::AssignDevice(core)); @@ -597,7 +695,8 @@ Status XlaCompiler::BuildArguments( } } else { for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { - const int core = (*arg_cores)[input_mapping->at(i)]; + auto it = arg_cores.find(i); + const int core = it == arg_cores.end() ? -1 : it->second; xla::XlaScopedShardingAssignment assign_sharding( builder, core == -1 ? absl::optional() : xla::sharding_builder::AssignDevice(core)); @@ -632,14 +731,14 @@ Status XlaCompiler::BuildArguments( // TODO(b/76097077): propagate device assignments onto arguments and // return values of functions, and then reshape unconditionally. if (is_entry_computation) { - arg_expression.set_handle( - xla::Reshape(arg_handles[i], arg.shape.dim_sizes())); + arg_expression = XlaExpression::XlaOp( + xla::Reshape(arg_handles[i], arg.shape.dim_sizes()), arg.type); } else { - arg_expression.set_handle(arg_handles[i]); + arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type); } break; case XlaCompiler::Argument::kToken: { - arg_expression.set_handle(arg_handles[i]); + arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type); break; } case XlaCompiler::Argument::kConstant: @@ -653,46 +752,48 @@ Status XlaCompiler::BuildArguments( } Status XlaCompiler::CompileSingleOp( - const XlaCompiler::CompileOptions& options, string const& name, - OpKernelContext* ctx, const std::vector& args, - CompilationResult* result) { + const XlaCompiler::CompileOptions& options, const NodeDef& node_def, + absl::Span args, + absl::Span result_types, CompilationResult* result) { // TODO(b/74182462): We implement this by creating a new dummy Graph including // _Arg nodes, and let CompileGraph walk it. This could be optimized. std::unique_ptr graph(new Graph(OpRegistry::Global())); Status status; // First create the actual node we care about computing. - Node* main_node = graph->AddNode(ctx->op_kernel().def(), &status); + Node* main_node = graph->AddNode(node_def, &status); TF_RETURN_IF_ERROR(status); // Create dummy _Arg nodes. Link these to `node` and also via a control // dependency edge to the _SOURCE node. 
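// Editor's illustrative sketch: for input 0 of the op being compiled, the loop
// below builds a node equivalent to
//   name: "_arg0"  op: "_Arg"
//   attr { T: <the input dtype, or DT_RESOURCE for resource arguments> }
//   attr { index: 0 }
// with a control edge from _SOURCE and a data edge into input 0 of `main_node`.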
- for (int64 i = 0; i < ctx->num_inputs(); ++i) { + for (int64 i = 0; i < args.size(); ++i) { Node* node; - string name = absl::StrCat(ctx->op_kernel().name(), "_", i, "_arg"); - Status status = NodeBuilder(name, "_Arg") - .ControlInput(graph->source_node()) - .Attr("T", ctx->input_dtype(i)) - .Attr("index", i) - .Finalize(graph.get(), &node); + string arg_name = absl::StrCat("_arg", i); + Status status = + NodeBuilder(arg_name, FunctionLibraryDefinition::kArgOp) + .ControlInput(graph->source_node()) + .Attr("T", args[i].kind == Argument::kResource ? DT_RESOURCE + : args[i].type) + .Attr("index", i) + .Finalize(graph.get(), &node); TF_RETURN_IF_ERROR(status); graph->AddEdge(node, 0, main_node, i); } // Similarly with return values, create dummy _Retval nodes fed by `node`. - for (int64 i = 0; i < ctx->num_outputs(); ++i) { + for (int64 i = 0; i < result_types.size(); ++i) { Node* node; - string name = absl::StrCat(ctx->op_kernel().name(), "_", i, "_retval"); - Status status = NodeBuilder(name, "_Retval") + string retval_name = absl::StrCat("_retval", i); + Status status = NodeBuilder(retval_name, FunctionLibraryDefinition::kRetOp) .Input(main_node, i) - .Attr("T", ctx->expected_output_dtype(i)) + .Attr("T", result_types[i]) .Attr("index", i) .Finalize(graph.get(), &node); TF_RETURN_IF_ERROR(status); } FixupSourceAndSinkEdges(graph.get()); - return CompileGraph(options, name, std::move(graph), args, result); + return CompileGraph(options, node_def.name(), std::move(graph), args, result); } namespace { @@ -747,12 +848,38 @@ Status ValidateGraph(const Graph* graph, return Status::OK(); } +// Converts the value of any expressions whose values are known at compile-time +// to constants. +Status ResolveConstantExpressionsToConstants( + xla::Client* client, absl::Span expressions) { + for (XlaExpression& expression : expressions) { + if (expression.kind() == XlaExpression::Kind::kXlaOp) { + TF_ASSIGN_OR_RETURN(absl::optional constant, + expression.ResolveConstant(client)); + if (constant.has_value()) { + expression = XlaExpression::Constant(*constant); + } + } + } + return Status::OK(); +} + +void ConvertConstantsToExpressions(xla::XlaBuilder* builder, + absl::Span expressions) { + for (XlaExpression& expression : expressions) { + if (expression.kind() == XlaExpression::Kind::kConstant) { + expression = + XlaExpression::XlaOp(expression.AsXlaOp(builder), expression.dtype()); + } + } +} + } // namespace Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, string const& name, std::unique_ptr graph, - const std::vector& args, + absl::Span args, CompilationResult* result) { VLOG(1) << "Executing graph symbolically to populate XlaBuilder."; @@ -774,13 +901,12 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, options_.device_type, name)); xla::XlaBuilder builder(name); - XlaContext* context = new XlaContext( - this, &builder, options_.allow_cpu_custom_calls, - options.resolve_compile_time_constants, options.is_entry_computation, - &options_.shape_representation_fn); + XlaContext* context = + new XlaContext(this, &builder, options_.allow_cpu_custom_calls, + &options_.shape_representation_fn); core::ScopedUnref context_unref(context); - std::vector real_args(args); + std::vector real_args(args.begin(), args.end()); int token_input_index = -1; std::unique_ptr token_output; if (options.add_token_input_output) { @@ -792,10 +918,14 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, real_args.push_back(token_arg); } + std::map 
arg_cores; + std::map retval_cores; + TF_ASSIGN_OR_RETURN(std::tie(arg_cores, retval_cores), + ComputeArgAndRetvalCores(*graph)); + std::vector arg_expressions; - std::vector arg_cores; TF_RETURN_IF_ERROR(BuildArguments( - *graph, real_args, options.use_tuple_arg, &builder, context, &arg_cores, + *graph, real_args, options.use_tuple_arg, &builder, context, arg_cores, &arg_expressions, &result->input_mapping, &result->xla_input_shapes, options.is_entry_computation)); context->set_args(std::move(arg_expressions)); @@ -843,9 +973,19 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, int num_computation_outputs; result->computation = std::make_shared(); result->outputs.resize(context->retvals().size()); + std::vector retvals = context->retvals(); + if (options.resolve_compile_time_constants) { + TF_RETURN_IF_ERROR(ResolveConstantExpressionsToConstants( + client(), absl::Span(retvals))); + } else { + ConvertConstantsToExpressions(&builder, absl::Span(retvals)); + } TF_RETURN_IF_ERROR(BuildComputation( - real_args, arg_cores, context->retvals(), context->resources(), - std::move(token_output), options.return_updated_values_for_all_resources, + real_args, retvals, arg_cores, retval_cores, context->resources(), + std::move(token_output), + options.is_entry_computation ? options_.shape_representation_fn + : ShapeRepresentationFn{}, + options.return_updated_values_for_all_resources, options.always_return_tuple, &builder, result->computation.get(), &num_computation_outputs, &num_nonconst_outputs, &result->outputs, &result->resource_updates)); diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 2cc603a5801..63426124686 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -18,10 +18,13 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/device.h" @@ -118,7 +121,7 @@ class XlaCompiler { // The type of the argument. If the argument is a resource, this // is the type of the variable's value, not DT_RESOURCE. - DataType type; + DataType type = DT_INVALID; // The shape of the argument. For: // * a parameter: the shape of the parameter. @@ -155,6 +158,9 @@ class XlaCompiler { std::set tensor_array_gradients; bool operator==(const Argument& other) const; + + // Returns a human-readable summary of the argument. + string HumanString() const; }; // Options pertaining to an individual call to CompileGraph() or @@ -259,8 +265,7 @@ class XlaCompiler { std::shared_ptr computation; }; - typedef std::function(const TensorShape&, - DataType)> + typedef std::function(const TensorShape&, DataType)> ShapeRepresentationFn; struct Options { // Name of the compilation device to use. It must be set by the caller. 
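As a usage sketch of the new ShapeRepresentationFn signature (hedged: this mirrors the updated tests further down rather than prescribing an API, and assumes the type_util.h and XLA shape_util headers are available), a function that stores every argument and variable as a flattened 1-D XLA array could look like:

XlaCompiler::Options options;  // device_type, client, flib_def, etc. still need to be set.
options.shape_representation_fn =
    [](const TensorShape& shape, DataType dtype) -> xla::StatusOr<xla::Shape> {
  xla::PrimitiveType ptype;
  TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(dtype, &ptype));
  // Represent the value as a rank-1 array with the same element count.
  return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()});
};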
@@ -316,22 +321,23 @@ class XlaCompiler { Status CompileFunction(const CompileOptions& options, const NameAttrList& fn_name_attrs, - std::vector args, CompilationResult* result); + absl::Span args, + CompilationResult* result); // Compiles a tensorflow::Graph into an xla::XlaComputation. // Similar to CompileFunction, but takes a Graph as input rather than a // function. Status CompileGraph(const CompileOptions& options, string const& name, std::unique_ptr graph, - const std::vector& args, + absl::Span args, CompilationResult* result); - // Compiles a single Op, given by an OpKernelContext, into an + // Compiles a single Op, given by `node_def`, into an // xla::XlaComputation. Similar to CompileFunction but takes a single Op as // input. - Status CompileSingleOp(const CompileOptions& options, string const& name, - OpKernelContext* ctx, - const std::vector& args, + Status CompileSingleOp(const CompileOptions& options, const NodeDef& node_def, + absl::Span args, + absl::Span result_types, CompilationResult* result); // Returns the shape of the XLA parameter for an argument 'arg'. @@ -411,7 +417,8 @@ class XlaCompiler { Status BuildArguments(const Graph& graph, const std::vector& args, bool use_tuple_arg, xla::XlaBuilder* builder, - XlaContext* context, std::vector* arg_cores, + XlaContext* context, + const std::map& arg_cores, std::vector* arg_expressions, std::vector* input_mapping, std::vector* input_shapes, diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 4ef154f856b..aaee208f634 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2xla/side_effect_util.h" +#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" @@ -1018,9 +1019,11 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { // Compiles the graph. XlaCompiler::Options options = DefaultOptions(); - options.shape_representation_fn = [](const TensorShape& shape, - DataType type) { - return TensorShape({shape.num_elements()}); + options.shape_representation_fn = + [](const TensorShape& shape, DataType type) -> xla::StatusOr { + xla::PrimitiveType ptype; + TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype)); + return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()}); }; XlaCompiler compiler(options); @@ -1086,9 +1089,11 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) { // Compiles the graph. 
XlaCompiler::Options options = DefaultOptions(); - options.shape_representation_fn = [](const TensorShape& shape, - DataType type) { - return TensorShape({shape.num_elements()}); + options.shape_representation_fn = + [](const TensorShape& shape, DataType type) -> xla::StatusOr { + xla::PrimitiveType ptype; + TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype)); + return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()}); }; XlaCompiler compiler(options); diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index 20e1ee2ddb3..43095fbb473 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -64,63 +64,23 @@ void XlaContext::set_args(std::vector args) { XlaContext::XlaContext( XlaCompiler* compiler, xla::XlaBuilder* builder, - bool allow_cpu_custom_calls, bool resolve_compile_time_constants, - bool is_entry_computation, - const std::function( + bool allow_cpu_custom_calls, + const std::function( const TensorShape&, DataType)>* shape_representation_fn) : compiler_(compiler), builder_(builder), allow_cpu_custom_calls_(allow_cpu_custom_calls), - resolve_compile_time_constants_(resolve_compile_time_constants), - is_entry_computation_(is_entry_computation), shape_representation_fn_(shape_representation_fn) {} string XlaContext::DebugString() { return "TLA JIT context"; } -// This is called by the Retval Op to associate a computed value -// with a specific return value of the subgraph. -void XlaContext::AddRetval(int retval_index, DataType type, - const TensorShape& shape, const xla::XlaOp& handle) { - VLOG(1) << "Added retval index " << retval_index << " to XLA computation"; - // Add the return value to the list being built up. - if (retvals_.size() <= retval_index) { - retvals_.resize(retval_index + 1); +void XlaContext::SetRetval(int index, const XlaExpression& expression) { + if (retvals_.size() <= index) { + retvals_.resize(index + 1); } - XlaExpression e; - e.set_handle(handle); - retvals_[retval_index] = Retval{type, shape, e}; + retvals_[index] = expression; } -Status XlaContext::AddConstRetval(int retval_index, DataType dtype, - const xla::LiteralSlice& literal) { - VLOG(1) << "Adding retval index " << retval_index - << " with non-data-dependent tensor to XLA computation"; - if (retvals_.size() <= retval_index) { - retvals_.resize(retval_index + 1); - } - Tensor value; - TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype, &value)); - XlaExpression e; - e.set_constant_value(value); - retvals_[retval_index] = Retval{dtype, value.shape(), e}; - return Status::OK(); -} - -Status XlaContext::AddResourceRetval(int retval_index, XlaResource* resource) { - VLOG(1) << "Adding retval index " << retval_index << " with resource " - << resource->name() << ":" << resource->shape().DebugString() - << " to XLA computation"; - if (retvals_.size() <= retval_index) { - retvals_.resize(retval_index + 1); - } - XlaExpression e; - e.set_resource(resource); - retvals_[retval_index] = Retval{DT_RESOURCE, resource->shape(), e}; - return Status::OK(); -} - -xla::XlaBuilder* XlaContext::builder() { return builder_; } - Status XlaContext::CreateResource( XlaResource::Kind kind, int arg_num, string name, DataType type, TensorShape shape, const xla::XlaOp& handle, int64 tensor_array_size, @@ -133,7 +93,7 @@ Status XlaContext::CreateResource( return Status::OK(); } -xla::StatusOr XlaContext::RepresentationShape( +xla::StatusOr XlaContext::RepresentationShape( const TensorShape& shape, DataType type) const { 
return (*shape_representation_fn_)(shape, type); } diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index 4da891634e9..dbfd344c9ba 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -20,8 +20,8 @@ limitations under the License. #include -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -46,9 +46,8 @@ class XlaContext : public ResourceBase { // Creates a new XlaContext. See the documentation on the class data fields // for descriptions of the arguments. XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder, - bool allow_cpu_custom_calls, bool resolve_compile_time_constants, - bool is_entry_computation, - const std::function( + bool allow_cpu_custom_calls, + const std::function( const TensorShape&, DataType)>* shape_representation_fn); // Virtual method defined by ResourceBase. @@ -57,37 +56,19 @@ class XlaContext : public ResourceBase { XlaCompiler* compiler() const { return compiler_; } // Returns the XlaBuilder that Ops use for compiling new expressions. - xla::XlaBuilder* builder(); + xla::XlaBuilder* builder() { return builder_; } bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; } - bool resolve_compile_time_constants() const { - return resolve_compile_time_constants_; - } - bool is_entry_computation() const { return is_entry_computation_; } - const std::vector& args() const { return args_; } void set_args(std::vector args); - struct Retval { - DataType type; - TensorShape shape; - // An XlaExpression representing the Retval's value. - XlaExpression expression; - }; - const std::vector& retvals() { return retvals_; } + const std::vector& retvals() { return retvals_; } - // This is called by the Retval Op to associate a computed value - // with a specific return value of the subgraph. - void AddRetval(int retval_index, DataType type, const TensorShape& shape, - const xla::XlaOp& handle); - - // As for Retval, but for return values that are compile-time constants. - Status AddConstRetval(int retval_index, DataType dtype, - const xla::LiteralSlice& literal); - - // As for Retval, but for return values that are resource handles. - Status AddResourceRetval(int retval_index, XlaResource* resource); + // Sets a return value. + // Since we do not always know in advance how many return values there are, + // grows the return values vector to size index+1 if it is smaller. + void SetRetval(int index, const XlaExpression& expression); // Creates a resource with resource `kind` and initial value `handle`. `name` // is a descriptive name for use in error messages. See the `XlaResource` @@ -105,8 +86,8 @@ class XlaContext : public ResourceBase { // Returns the XLA shape to be used to represent a variable of TF `shape` // and `type`, or of an argument or return value of a top-level computation. - xla::StatusOr RepresentationShape(const TensorShape& shape, - DataType type) const; + xla::StatusOr RepresentationShape(const TensorShape& shape, + DataType type) const; // Get an XLA lambda to compute Max. This is cached in the // XlaContext since it may be used by multiple Ops. 
There is a @@ -140,31 +121,19 @@ class XlaContext : public ResourceBase { // Allow ops to emit CustomCall operations for CPU. const bool allow_cpu_custom_calls_; - // If true, constant return values are returned as Tensors instead of - // run-time computation outputs. - const bool resolve_compile_time_constants_; - // Arguments to the Tensorflow graph, indexed by _Arg index. // Includes both compile-time constant arguments and runtime parameters. std::vector args_; // Return values of the Tensorflow graph, indexed by _Retval index. - std::vector retvals_; + std::vector retvals_; // Holds ownership of resources. The resources are not ordered. std::vector> resources_; - // Is this a top-level computation, or an inner computation (e.g., a while - // body)? - const bool is_entry_computation_; - - // A function that describes how the shapes of - // a) argument and return value, for entry computations - // b) variables, for all computations, - // should be represented in XLA. Parameters/return values will be shaped - // according to this function, and reshaped back to/from their declared shapes - // for computations. Must be non-null. - const std::function(const TensorShape&, DataType)>* + // Describes the on-host shapes of parameters and return values. Also see: + // XlaDevice::Options::shape_representation_fn. + const std::function(const TensorShape&, DataType)>* shape_representation_fn_; // Cache of prebuilt computations indexed by their type. diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc new file mode 100644 index 00000000000..ca0309166b7 --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_expression.cc @@ -0,0 +1,145 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_expression.h" + +#include "tensorflow/compiler/tf2xla/literal_util.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +XlaExpression::XlaExpression() = default; + +XlaExpression XlaExpression::Invalid() { + XlaExpression e; + e.kind_ = Kind::kInvalid; + return e; +} + +XlaExpression XlaExpression::Constant(Tensor value) { + XlaExpression e; + e.kind_ = Kind::kConstant; + e.dtype_ = value.dtype(); + e.constant_value_ = value; + return e; +} + +XlaExpression XlaExpression::XlaOp(xla::XlaOp value, DataType dtype) { + XlaExpression e; + e.kind_ = Kind::kXlaOp; + e.dtype_ = dtype; + e.handle_ = value; + return e; +} + +XlaExpression XlaExpression::Resource(XlaResource* resource) { + XlaExpression e; + e.kind_ = Kind::kResource; + e.dtype_ = DT_RESOURCE; + e.resource_ = resource; + return e; +} + +string XlaExpression::HumanString() const { + switch (kind_) { + case Kind::kInvalid: + return "invalid"; + case Kind::kConstant: + return "constant"; + case Kind::kXlaOp: + return "xla_op"; + case Kind::kResource: + return "resource"; + } +} + +xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const { + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + switch (kind_) { + case Kind::kConstant: { + xla::BorrowingLiteral literal; + TF_RETURN_IF_ERROR( + HostTensorToBorrowingLiteral(constant_value_, &literal)); + return xla::ConstantLiteral(builder, literal); + } + case Kind::kXlaOp: + if (builder != handle_.builder()) { + return errors::InvalidArgument( + "Mismatched builders in XlaExpression::AsXlaOp"); + } + return handle_; + default: + return errors::InvalidArgument("AsXlaOp called on XlaExpression: ", + HumanString()); + } + }); +} + +xla::StatusOr> XlaExpression::ResolveConstant( + xla::Client* client) const { + switch (kind()) { + case Kind::kConstant: + return {constant_value()}; + case Kind::kXlaOp: + break; + case Kind::kResource: + case Kind::kInvalid: + return errors::InvalidArgument( + "ResolveConstant called on XlaExpression: ", HumanString()); + } + + TF_ASSIGN_OR_RETURN(bool is_constant, + handle().builder()->IsConstant(handle())); + if (!is_constant) return {absl::nullopt}; + + TF_ASSIGN_OR_RETURN(xla::XlaComputation constant_graph, + handle().builder()->BuildConstantSubGraph(handle())); + + TF_ASSIGN_OR_RETURN(TensorShape shape, GetShape()); + + // The XLA layout is specified minor to major, and TensorFlow uses a major to + // minor order. 
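// Editor's illustrative note: for a rank-3 tensor, the std::iota over the
// reversed vector below yields layout_indices = {2, 1, 0}; listing the
// fastest-varying TensorFlow dimension first is XLA's minor-to-major encoding
// of a row-major layout.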
+ std::vector layout_indices(shape.dims()); + std::iota(layout_indices.rbegin(), layout_indices.rend(), 0); + xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices); + TF_ASSIGN_OR_RETURN(xla::Literal literal, + client->ComputeConstant(constant_graph, &layout)); + Tensor tensor; + TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype(), &tensor)); + return {tensor}; +} + +xla::StatusOr XlaExpression::GetShape() const { + switch (kind_) { + case Kind::kConstant: + return constant_value().shape(); + case Kind::kXlaOp: { + TF_ASSIGN_OR_RETURN(xla::Shape xla_shape, + handle().builder()->GetShape(handle())); + TensorShape shape; + TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, &shape)); + return shape; + } + case Kind::kResource: + return TensorShape({}); + case Kind::kInvalid: + return errors::InvalidArgument( + "GetShape() called on invalid XlaExpression"); + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h new file mode 100644 index 00000000000..bed6761d362 --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_expression.h @@ -0,0 +1,115 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2xla/xla_resource.h" +#include "tensorflow/compiler/xla/client/client.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// A XlaExpression represents a symbolic TensorFlow value in a TF->XLA +// compilation. +// An expression is one of: +// * a constant tensor. +// * an xla::XlaOp, representing a symbolic XLA value. +// * a resource, e.g., a variable, represented as an XlaResource pointer. +// +// Constant tensors are mostly an optimization to avoid passing large constants +// to XLA, but are also sometimes used to represent tensors that have no XLA +// representation, for example, DT_STRING tensors. A canonical use case might be +// an error message string. +class XlaExpression { + public: + enum class Kind { + kInvalid, + kConstant, + kXlaOp, + kResource, + }; + + XlaExpression(); + XlaExpression(const XlaExpression&) = default; + XlaExpression& operator=(const XlaExpression&) = default; + + // Builds an invalid expression. (Same as the default constructor, but makes + // the intent clearer.) + static XlaExpression Invalid(); + + // Builds a constant XLA expression. + static XlaExpression Constant(Tensor value); + + // Builds a XlaOp expression. Since the mapping from TF data types to XLA + // types is not 1-1, the TF type must also be provided; in general it cannot + // be derived from the XLA type. 
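// (Editor's illustrative note: the mapping is not invertible because, for
// example, DT_INT32 and DT_QINT32 both lower to xla::S32.)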
+ static XlaExpression XlaOp(xla::XlaOp value, DataType dtype); + + // Builds a resource expression. + static XlaExpression Resource(XlaResource* resource); + + Kind kind() const { return kind_; } + + DataType dtype() const { return dtype_; } + + // handle() returns the XlaOp that backs a kXlaOp expression. + const xla::XlaOp& handle() const { return handle_; } + + const Tensor& constant_value() const { return constant_value_; } + + XlaResource* resource() const { return resource_; } + + // Returns a human-readable summary of the expression. + string HumanString() const; + + // Returns the value of a kConstant or kXlaOp as an xla::XlaOp. Returns + // an erroneous XlaOp if the expression is not a constant or an expression. + xla::XlaOp AsXlaOp(xla::XlaBuilder* builder) const; + + // If a kXlaOp or kConstant expression can be resolved to a compile-time + // constant, returns the value as a host-memory Tensor. Returns an empty + // optional if it cannot be resolved. Returns an error if passed a resource + // expression. + xla::StatusOr> ResolveConstant( + xla::Client* client) const; + + // Returns the shape of the tensor. + // The shape of a resource is the shape of a resource handle (i.e., a scalar), + // not the shape of the resource's value. + xla::StatusOr GetShape() const; + + private: + Kind kind_ = Kind::kInvalid; + + DataType dtype_ = DT_INVALID; + + // The XLA handle of the expression's computation, if kind_ == kXlaOp. + xla::XlaOp handle_; + + // The value of the constant, if kind_ == kConstant. + Tensor constant_value_; + + // The resource, if kind_ == kResource. Not owned. + XlaResource* resource_ = nullptr; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_ diff --git a/tensorflow/compiler/tf2xla/xla_expression_test.cc b/tensorflow/compiler/tf2xla/xla_expression_test.cc new file mode 100644 index 00000000000..84202c93139 --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_expression_test.cc @@ -0,0 +1,135 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "tensorflow/compiler/tf2xla/xla_resource.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +class XlaExpressionTest : public ::testing::Test { + protected: + void SetUp() override { + client_ = xla::ClientLibrary::LocalClientOrDie(); + builder_ = absl::make_unique("acomputation"); + constant_ = test::AsScalar(42); + op_ = xla::ConstantR0(builder_.get(), 7); + non_constant_op_ = xla::Parameter( + builder_.get(), 0, xla::ShapeUtil::MakeShape(xla::F32, {}), "x"); + resource_ = absl::make_unique( + XlaResource::kVariable, /*arg_num=*/0, /*name=*/string("avariable"), + DT_INT32, TensorShape({17, 3}), op_, /*tensor_array_size=*/-1, + /*tensor_array_gradients=*/std::set(), + /*tensor_array_multiple_writes_aggregate=*/false); + } + + xla::Client* client_; + std::unique_ptr builder_; + Tensor constant_; + xla::XlaOp op_; + xla::XlaOp non_constant_op_; + std::unique_ptr resource_; +}; + +TEST_F(XlaExpressionTest, Kind) { + EXPECT_TRUE(XlaExpression::Kind::kInvalid == XlaExpression().kind()); + EXPECT_TRUE(XlaExpression::Kind::kInvalid == XlaExpression::Invalid().kind()); + EXPECT_TRUE(XlaExpression::Kind::kConstant == + XlaExpression::Constant(constant_).kind()); + EXPECT_TRUE(XlaExpression::Kind::kXlaOp == + XlaExpression::XlaOp(op_, DT_INT32).kind()); + EXPECT_TRUE(XlaExpression::Kind::kResource == + XlaExpression::Resource(resource_.get()).kind()); +} + +TEST_F(XlaExpressionTest, HumanString) { + EXPECT_EQ("invalid", XlaExpression().HumanString()); + EXPECT_EQ("invalid", XlaExpression::Invalid().HumanString()); + EXPECT_EQ("constant", XlaExpression::Constant(constant_).HumanString()); + EXPECT_EQ("xla_op", XlaExpression::XlaOp(op_, DT_INT32).HumanString()); + EXPECT_EQ("resource", XlaExpression::Resource(resource_.get()).HumanString()); +} + +TEST_F(XlaExpressionTest, AsXlaOp) { + xla::XlaOp op_as_op = + XlaExpression::XlaOp(op_, DT_INT32).AsXlaOp(builder_.get()); + EXPECT_TRUE(op_.IsIdenticalTo(op_as_op)); + + xla::XlaOp const_as_op = + XlaExpression::Constant(constant_).AsXlaOp(builder_.get()); + TF_ASSERT_OK_AND_ASSIGN(xla::XlaComputation computation, + builder_->BuildConstantSubGraph(const_as_op)); + TF_ASSERT_OK_AND_ASSIGN(xla::Literal value, + client_->ComputeConstant(computation)); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(xla::LiteralUtil::CreateR0(42), + value)); +} + +TEST_F(XlaExpressionTest, GetShape) { + EXPECT_FALSE(XlaExpression().GetShape().ok()); + EXPECT_FALSE(XlaExpression::Invalid().GetShape().ok()); + + TF_ASSERT_OK_AND_ASSIGN(TensorShape resource_shape, + XlaExpression::Resource(resource_.get()).GetShape()); + EXPECT_EQ(TensorShape({}), resource_shape); + + TF_ASSERT_OK_AND_ASSIGN(TensorShape op_shape, + XlaExpression::XlaOp(op_, DT_INT32).GetShape()); + EXPECT_EQ(TensorShape({}), op_shape); + + TF_ASSERT_OK_AND_ASSIGN(TensorShape constant_shape, + 
XlaExpression::Constant(constant_).GetShape()); + EXPECT_EQ(TensorShape({}), constant_shape); +} + +TEST_F(XlaExpressionTest, ResolveConstant) { + EXPECT_FALSE(XlaExpression().ResolveConstant(client_).ok()); + EXPECT_FALSE(XlaExpression::Invalid().ResolveConstant(client_).ok()); + EXPECT_FALSE( + XlaExpression::Resource(resource_.get()).ResolveConstant(client_).ok()); + + TF_ASSERT_OK_AND_ASSIGN( + absl::optional op_constant, + XlaExpression::XlaOp(op_, DT_INT32).ResolveConstant(client_)); + ASSERT_TRUE(op_constant.has_value()); + test::ExpectTensorEqual(test::AsScalar(7), *op_constant); + + TF_ASSERT_OK_AND_ASSIGN(absl::optional op_nonconstant, + XlaExpression::XlaOp(non_constant_op_, DT_FLOAT) + .ResolveConstant(client_)); + EXPECT_FALSE(op_nonconstant.has_value()); + + TF_ASSERT_OK_AND_ASSIGN( + absl::optional constant_constant, + XlaExpression::Constant(constant_).ResolveConstant(client_)); + ASSERT_TRUE(constant_constant.has_value()); + test::ExpectTensorEqual(constant_, *constant_constant); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index dd3498ef7aa..8dd8def0549 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -43,32 +44,36 @@ xla::XlaBuilder* XlaOpKernelContext::builder() const { static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) { const XlaExpression* expression = reinterpret_cast(tensor.tensor_data().data()); - CHECK(expression->handle().valid() || expression->resource() != nullptr); - VLOG(1) << "Fetched T" << expression->handle(); + CHECK(expression->kind() != XlaExpression::Kind::kInvalid) + << expression->HumanString(); return expression; } -// Retrieves an uninitialized XlaExpression from a newly-allocated tensor. -static XlaExpression* CastExpressionFromUninitializedTensor(Tensor* tensor) { +// Assigns an XlaExpression to a tensor on an XLA compilation device. +static void AssignExpressionToTensor(Tensor* tensor, + const XlaExpression& value) { const XlaExpression* expression = reinterpret_cast(tensor->tensor_data().data()); - CHECK(!expression->handle().valid()); - return const_cast(expression); + CHECK(expression->kind() == XlaExpression::Kind::kInvalid) + << expression->HumanString(); + *const_cast(expression) = value; } -// Retrieves the XlaOp from an input Tensor to an Op. This computation was -// constructed by an Op that executed previously and created the output Tensor -// using CreateOutputTensorFromComputation or CreateConstantOutputTensor. 
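A minimal usage sketch of the XlaExpression API introduced above, modelled on xla_expression_test.cc. This is not part of the patch; the function name is illustrative and error handling is elided.

```c++
#include "absl/types/optional.h"
#include "tensorflow/compiler/tf2xla/xla_expression.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

void XlaExpressionExample() {
  // Wrap a symbolic XLA value. The TF dtype must be passed explicitly
  // because the TF->XLA type mapping is not 1-1 (see the class comment).
  xla::XlaBuilder builder("example");
  xla::XlaOp seven = xla::ConstantR0<int32>(&builder, 7);
  XlaExpression expr = XlaExpression::XlaOp(seven, DT_INT32);
  CHECK(expr.kind() == XlaExpression::Kind::kXlaOp);

  // An expression that depends only on constants can be folded back into a
  // host-memory Tensor at compile time via ResolveConstant().
  xla::Client* client = xla::ClientLibrary::LocalClientOrDie();
  absl::optional<Tensor> folded = expr.ResolveConstant(client).ValueOrDie();
  if (folded.has_value()) {
    LOG(INFO) << folded->scalar<int32>()();  // prints 7
  }
}

}  // namespace tensorflow
```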
-static const xla::XlaOp& GetComputationFromTensor(const Tensor& tensor) { - return CastExpressionFromTensor(tensor)->handle(); +const XlaExpression& XlaOpKernelContext::InputExpression(int index) { + return *CastExpressionFromTensor(context_->input(index)); } -const xla::XlaOp& XlaOpKernelContext::Input(int index) { - return GetComputationFromTensor(context_->input(index)); +const XlaExpression& XlaOpKernelContext::InputExpression( + absl::string_view name) { + return *CastExpressionFromTensor(GetInputTensorByName(name)); } -const xla::XlaOp& XlaOpKernelContext::Input(absl::string_view name) { - return GetComputationFromTensor(GetInputTensorByName(name)); +xla::XlaOp XlaOpKernelContext::Input(int index) { + return InputExpression(index).AsXlaOp(builder()); +} + +xla::XlaOp XlaOpKernelContext::Input(absl::string_view name) { + return InputExpression(name).AsXlaOp(builder()); } TensorShape XlaOpKernelContext::InputShape(int index) { @@ -125,77 +130,18 @@ Status XlaOpKernelContext::ConstantInput(absl::string_view name, Status XlaOpKernelContext::ConstantInputReshaped( int index, absl::Span new_dims, xla::Literal* constant_literal) { - const Tensor& tensor = context_->input(index); - TensorShape new_shape(new_dims); - if (tensor.NumElements() != new_shape.num_elements()) { - return errors::InvalidArgument( - context_->op_kernel().name(), " input ", index, " has shape ", - tensor.shape().DebugString(), - " but was asked to be reshaped to incompatible shape ", - new_shape.DebugString()); - } - const XlaExpression* expression = CastExpressionFromTensor(tensor); - - auto copy_tensor_to_literal = [](const Tensor& tensor, - xla::Literal* literal) { - xla::Shape literal_shape; - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(tensor.dtype(), tensor.shape(), &literal_shape)); - - *literal = xla::Literal(literal_shape); - - // memcpy over the payload ... - // TODO(phawkins): handle string types. - size_t total_bytes = tensor.TotalBytes(); - if (total_bytes > 0) { - void* dst_ptr = literal->untyped_data(); - const void* src_ptr = DMAHelper::base(&tensor); - memcpy(dst_ptr, src_ptr, total_bytes); - } - return Status::OK(); - }; - - // If the tensor has a known constant value, there is no need to invoke XLA. - if (expression->has_constant_value()) { - Tensor temp(tensor.dtype()); - if (!temp.CopyFrom(expression->constant_value(), new_shape)) { - // This should never happen. The constant should have a shape compatible - // with the enclosing Tensor. - return errors::Internal("Incompatible shapes in ConstantInputReshaped."); - } - - return copy_tensor_to_literal(temp, constant_literal); - } - - // Make sure we treat zero-element tensors as constant. - if (new_shape.num_elements() == 0) { - Tensor temp(tensor.dtype(), new_shape); - - return copy_tensor_to_literal(temp, constant_literal); - } - - xla::XlaOp handle = expression->handle(); - if (new_shape != tensor.shape()) { - // Reshape the handle to the desired shape. - handle = xla::Reshape(handle, new_shape.dim_sizes()); - } - - // The XLA layout is specified minor to major, and TensorFlow's minor - // dimension is the last one. 
- std::vector layout_indices(new_shape.dims()); - std::iota(layout_indices.rbegin(), layout_indices.rend(), 0); - xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices); - - xla::StatusOr is_constant = builder()->IsConstant(handle); - if (!is_constant.ok()) { - Status status = is_constant.status(); + XlaExpression e = InputExpression(index); + xla::StatusOr> constant_or_status = + e.ResolveConstant(compiler()->client()); + if (!constant_or_status.ok()) { + Status status = constant_or_status.status(); errors::AppendToMessage(&status, "while evaluating input ", index, " of ", context_->op_kernel().type_string(), " operator as a compile-time constant."); return status; } - - if (!is_constant.ValueOrDie()) { + absl::optional constant = constant_or_status.ValueOrDie(); + if (!constant.has_value()) { return errors::InvalidArgument( "Input ", index, " to ", context_->op_kernel().type_string(), " operator must be a compile-time constant.\n" @@ -208,25 +154,16 @@ Status XlaOpKernelContext::ConstantInputReshaped( "stateful operation such as a random number generator."); } - // Ask the XLA compiler to evaluate the data handle to a literal. - xla::StatusOr constant_graph = - builder()->BuildConstantSubGraph(handle); - if (!constant_graph.ok()) { - return errors::Internal( - "Error getting a compile-time constant graph for ", - context_->op_kernel().name(), " input ", index, - ".\nError: ", constant_graph.status().error_message()); + Tensor temp(constant->dtype()); + if (!temp.CopyFrom(*constant, TensorShape(new_dims))) { + return errors::InvalidArgument( + context_->op_kernel().name(), " input ", index, " has shape ", + constant->shape().DebugString(), + " but was asked to be reshaped to incompatible shape ", + TensorShape(new_dims).DebugString()); } - xla::StatusOr computed = compiler()->client()->ComputeConstant( - constant_graph.ValueOrDie(), &layout); - if (!computed.ok()) { - return errors::Internal("Error evaluating ", context_->op_kernel().name(), - " input ", index, - " as a compile-time constant.\nError: ", - computed.status().error_message()); - } - *constant_literal = std::move(computed).ValueOrDie(); + TF_ASSIGN_OR_RETURN(*constant_literal, HostTensorToLiteral(temp)); return Status::OK(); } @@ -322,6 +259,15 @@ Status XlaOpKernelContext::ConstantInputReshapedToIntVector( return LiteralToInt64Vector(literal, out); } +Status XlaOpKernelContext::ConstantInputReshapedToIntVector( + absl::string_view name, std::vector* out) { + TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name)); + xla::Literal literal; + TF_RETURN_IF_ERROR(ConstantInputReshaped( + index, {InputShape(index).num_elements()}, &literal)); + return LiteralToInt64Vector(literal, out); +} + Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index, xla::Literal* out) { xla::Literal literal; @@ -372,7 +318,7 @@ Status XlaOpKernelContext::InputList(absl::string_view name, handles->clear(); shapes->clear(); for (const Tensor& input : inputs) { - handles->push_back(GetComputationFromTensor(input)); + handles->push_back(CastExpressionFromTensor(input)->AsXlaOp(builder())); shapes->push_back(input.shape()); } return Status::OK(); @@ -413,9 +359,12 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type, XlaContext& xla_context = XlaContext::Get(ctx); TF_ASSIGN_OR_RETURN( - TensorShape representation_shape, + xla::Shape representation_shape, xla_context.RepresentationShape(variable->shape(), variable->type())); - if (representation_shape == variable->shape()) { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR( 
+ TensorShapeToXLAShape(variable->type(), variable->shape(), &xla_shape)); + if (xla::ShapeUtil::Compatible(xla_shape, representation_shape)) { *value = variable->value(); } else { *value = xla::Reshape(variable->value(), variable->shape().dim_sizes()); @@ -455,90 +404,53 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, return Status::OK(); } -Status XlaOpKernelContext::allocate_output(int index, const xla::Shape& shape, - Tensor** output) { - // The step's default allocator is the dummy XlaCompilationAllocator which - // simply allocates a metadata buffer to hold the expression to which it - // corresponds. - if (expected_output_dtype(index) == DT_VARIANT) { - // tensor_data() is not supported for variant Tensor (i.e., - // DataTypeCanUseMemcpy is false for DT_VARIANT), and so storing the - // XlaExpression inside the Tensor's tensor_data() does not work for - // variant. Instead construct a uint8 tensor and store the expression in its - // value. - // TODO(jpienaar): This should be refactored to stop masquerading - // XlaExpressions as Tensors. - *output = new Tensor(); - TensorShape tensor_shape; - TF_RETURN_IF_ERROR( - context_->allocate_temp(DT_UINT8, tensor_shape, *output)); - context_->set_output(index, **output); - } else { - TensorShape tensor_shape; - TF_RETURN_IF_ERROR(XLAShapeToTensorShape(shape, &tensor_shape)); - TF_RETURN_IF_ERROR(context_->allocate_output(index, tensor_shape, output)); +void XlaOpKernelContext::SetOutputExpression(int index, + const XlaExpression& expression) { + Status status = [&] { + // The step's default allocator is the dummy XlaCompilationAllocator which + // simply allocates a metadata buffer to hold the expression to which it + // corresponds. + Tensor* output = nullptr; + // Provides a special behavior for DT_VARIANT: a variant is treated as + // DT_UINT8 scalar as the type to allow mapping for variant to more generic + // types. + if (expression.dtype() == DT_VARIANT) { + // tensor_data() is not supported for variant Tensor (i.e., + // DataTypeCanUseMemcpy is false for DT_VARIANT), and so storing the + // XlaExpression inside the Tensor's tensor_data() does not work for + // variant. Instead construct a uint8 tensor and store the expression in + // its value. + // TODO(jpienaar): This should be refactored to stop masquerading + // XlaExpressions as Tensors. + output = new Tensor(); + TensorShape tensor_shape; + TF_RETURN_IF_ERROR( + context_->allocate_temp(DT_UINT8, tensor_shape, output)); + context_->set_output(index, *output); + } else { + TF_ASSIGN_OR_RETURN(TensorShape shape, expression.GetShape()); + TF_RETURN_IF_ERROR(context_->allocate_output(index, shape, &output)); + } + AssignExpressionToTensor(output, expression); + return Status::OK(); + }(); + if (!status.ok()) { + SetStatus(status); } - return Status::OK(); } void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) { - // Makes the host Tensor that will refer to the expression. - Tensor* output = nullptr; - auto shape_or = builder()->GetShape(handle); - if (!shape_or.ok()) { - SetStatus(shape_or.status()); - return; - } - - OP_REQUIRES_OK(context_, - allocate_output(index, shape_or.ValueOrDie(), &output)); - - // The expression is stored in the tensor's data buffer. Fill in the - // fields now. 
- XlaExpression* expression = CastExpressionFromUninitializedTensor(output); - expression->set_handle(handle); + SetOutputExpression( + index, + XlaExpression::XlaOp(handle, context_->expected_output_dtype(index))); } void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) { - const TensorShape& shape = constant.shape(); - - xla::BorrowingLiteral literal; - OP_REQUIRES_OK(context_, HostTensorToBorrowingLiteral(constant, &literal)); - - xla::XlaOp handle = xla::ConstantLiteral(builder(), literal); - CHECK(handle.valid()); - - // Make the Tensor that will refer to the expression. - Tensor* output = nullptr; - // The step's default allocator is the dummy XlaCompilationAllocator which - // simply allocates a metadata buffer to hold the expression to which it - // corresponds. - OP_REQUIRES_OK(context_, context_->allocate_output(index, shape, &output)); - - // The expression is stored in the tensor's data buffer. Fill in the - // fields now. - XlaExpression* expression = CastExpressionFromUninitializedTensor(output); - expression->set_handle(handle); - expression->set_constant_value(constant); -} - -void XlaOpKernelContext::SetInvalidOutput(int index) { - Tensor* output = nullptr; - OP_REQUIRES_OK(context_, - context_->allocate_output(index, TensorShape({}), &output)); - XlaExpression* expression = CastExpressionFromUninitializedTensor(output); - xla::XlaOp handle; - expression->set_handle(handle); + SetOutputExpression(index, XlaExpression::Constant(constant)); } void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) { - Tensor* output = nullptr; - // The shape of the output tensor is the shape of the resource itself - // (i.e., a scalar), not the shape of the resource's value. - OP_REQUIRES_OK(context_, - context_->allocate_output(index, TensorShape(), &output)); - XlaExpression* expression = CastExpressionFromUninitializedTensor(output); - expression->set_resource(resource); + SetOutputExpression(index, XlaExpression::Resource(resource)); } Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) { @@ -570,10 +482,13 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type, TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape)); XlaContext& xla_context = XlaContext::Get(ctx); - TF_ASSIGN_OR_RETURN(TensorShape representation_shape, + TF_ASSIGN_OR_RETURN(xla::Shape representation_shape, xla_context.RepresentationShape(shape, type)); - if (shape != representation_shape) { - handle = xla::Reshape(handle, representation_shape.dim_sizes()); + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); + if (!xla::ShapeUtil::Compatible(xla_shape, representation_shape)) { + handle = xla::Reshape(handle, + xla::AsInt64Slice(representation_shape.dimensions())); } return variable->SetValue(handle); } diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index aa00a454968..c06efa2c474 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -88,9 +88,9 @@ class XlaOpKernelContext { // Returns input `index` as a XlaOp. Unlike // OpKernelContext::Input returns a symbolic value rather than a concrete // Tensor. - const xla::XlaOp& Input(int index); + xla::XlaOp Input(int index); // Returns input `name` as a XlaOp. 
- const xla::XlaOp& Input(absl::string_view name); + xla::XlaOp Input(absl::string_view name); // Returns true if all inputs are the same shape, otherwise sets the // status to a non-OK value and returns false. @@ -111,14 +111,6 @@ class XlaOpKernelContext { Status ConstantInput(int index, xla::Literal* constant_literal); Status ConstantInput(absl::string_view name, xla::Literal* constant_literal); - // Evaluates input `index`, reshapes it to `new_shape` if new_shape != - // InputShape(index), and stores it in `*constant_literal`. If the input - // cannot be evaluated, e.g., because it depends on unbound parameters, - // returns a non-Ok status. If InputShape(index).num_elements() != - // new_shape.num_elements(), returns an error status. - Status ConstantInputReshaped(int index, absl::Span new_dims, - xla::Literal* constant_literal); - // Converts a constant scalar int32 or int64 tensor into an int64. Status ConstantInputAsIntScalar(int index, int64* out); Status ConstantInputAsIntScalar(absl::string_view name, int64* out); @@ -134,6 +126,8 @@ class XlaOpKernelContext { // Reshapes and converts a constant int32 or int64 tensor into a vector of // int64s. Status ConstantInputReshapedToIntVector(int index, std::vector* out); + Status ConstantInputReshapedToIntVector(absl::string_view name, + std::vector* out); // Converts a constant int32 or int64 Tensor into an xla int64 Literal. Status ConstantInputAsInt64Literal(int index, xla::Literal* out); @@ -148,6 +142,10 @@ class XlaOpKernelContext { Status ConstantInputList(absl::string_view name, std::vector* literals); + // Returns an XlaExpression describing the value of 'index'. + const XlaExpression& InputExpression(int index); + const XlaExpression& InputExpression(absl::string_view name); + // Outputs int num_outputs() const { return context_->num_outputs(); } @@ -165,9 +163,8 @@ class XlaOpKernelContext { // SetConstantOutput where possible. void SetConstantOutput(int index, const Tensor& host_tensor); - // Sets output `index` to an invalid value. - // Any subsequent attempt to consume this output will cause an error. - void SetInvalidOutput(int index); + // Sets output `index` to the given XlaExpression `expression`. + void SetOutputExpression(int index, const XlaExpression& expression); // Status handling. void SetStatus(const Status& status) { context_->SetStatus(status); } @@ -255,10 +252,13 @@ class XlaOpKernelContext { // Returns the tensor of input `name`. const Tensor& GetInputTensorByName(absl::string_view name); - // Wraps OpKernelContext's allocate_output method while providing special - // behavior for DT_VARIANT: a variant is treated as DT_UINT8 scalar as the - // type to allow mapping for variant to more generic types. - Status allocate_output(int index, const xla::Shape& shape, Tensor** output); + // Evaluates input `index`, reshapes it to `new_shape` if new_shape != + // InputShape(index), and stores it in `*constant_literal`. If the input + // cannot be evaluated, e.g., because it depends on unbound parameters, + // returns a non-Ok status. If InputShape(index).num_elements() != + // new_shape.num_elements(), returns an error status.
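Taken together, the xla_op_kernel changes above mean an XLA op kernel now works with value-typed xla::XlaOp handles and XlaExpressions. A hypothetical kernel sketch, not part of the patch ("MyAddOffset" and the class name are made up for illustration):

```c++
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"

namespace tensorflow {

class AddOffsetOp : public XlaOpKernel {
 public:
  explicit AddOffsetOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}

  void Compile(XlaOpKernelContext* ctx) override {
    // Input() now returns an xla::XlaOp by value; internally it is
    // InputExpression(index).AsXlaOp(builder()).
    xla::XlaOp x = ctx->Input(0);

    // Constant inputs are resolved through XlaExpression::ResolveConstant
    // rather than the removed handle/IsConstant path. This assumes input 1
    // is marked as a compile-time constant in the op's registration.
    int64 offset;
    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &offset));

    // SetOutput() is now a thin wrapper over SetOutputExpression().
    ctx->SetOutput(0, xla::Add(x, xla::ScalarLike(x, offset)));
  }
};

// Hypothetical op name, used only for this sketch.
REGISTER_XLA_OP(Name("MyAddOffset"), AddOffsetOp);

}  // namespace tensorflow
```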
+ Status ConstantInputReshaped(int index, absl::Span new_dims, + xla::Literal* constant_literal); OpKernelContext* const context_; }; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index 9f00de708cc..dcd0e9c5c1f 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" @@ -129,21 +130,27 @@ XlaOpRegistry::~XlaOpRegistry() = default; // Lazily register the CPU and GPU JIT devices the first time // GetCompilationDevice is called. static void* registration_init = [®istry]() { + legacy_flags::MarkForCompilationPassFlags* flags = + legacy_flags::GetMarkForCompilationPassFlags(); + bool cpu_global_jit = flags->tf_xla_cpu_global_jit; + mutex_lock lock(registry.mutex_); if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_CPU)).ok()) { DeviceRegistration& registration = registry.compilation_devices_[DEVICE_CPU]; registration.compilation_device_name = DEVICE_CPU_XLA_JIT; - registration.requires_compilation = false; - registration.enable_jit_by_default = false; + registration.autoclustering_policy = + cpu_global_jit + ? XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally + : XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested; registration.compile_resource_ops = false; } if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_GPU)).ok()) { DeviceRegistration& registration = registry.compilation_devices_[DEVICE_GPU]; registration.compilation_device_name = DEVICE_GPU_XLA_JIT; - registration.requires_compilation = false; - registration.enable_jit_by_default = true; + registration.autoclustering_policy = + XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally; registration.compile_resource_ops = false; } return nullptr; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index 45a40c0acc0..0bdd4a10854 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -66,19 +66,26 @@ class XlaOpRegistry { public: typedef OpKernel* (*Factory)(OpKernelConstruction*); + enum class AutoclusteringPolicy { + // Enable autoclustering if the user requests it, e.g., via + // experimental_jit_scope. Does not autocluster if the JIT is enabled + // globally (e.g., via the OptimizerOptions in the TF session + // configuration.) + kIfExplicitlyRequested, + // Enable autoclustering if explicitly requested, or if the JIT is enabled + // globally in the session options, or via TF_XLA_FLAGS=--tf_xla_auto_jit=N. + kIfEnabledGlobally, + // Always try to autocluster ops placed on this device. + kAlways, + }; + // Describes how to compile operators assigned to a device. struct DeviceRegistration { // The name of the an XLA compilation device to use to compile code. string compilation_device_name; - // Do operators assigned to this device require compilation? - bool requires_compilation; - - // If !requires_compilation, should we try to JIT operators on this device - // when XLA JIT compilation is enabled globally via the SessionOptions? - // (It is still possible to explicitly mark operators to JIT compile, even - // if enable_jit_by_default is false.) 
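A sketch of what a device registration looks like under the new AutoclusteringPolicy enum; the values mirror the GPU registration in the xla_op_registry.cc hunk above, and the helper function name is illustrative only.

```c++
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"

namespace tensorflow {

XlaOpRegistry::DeviceRegistration MakeExampleRegistration() {
  XlaOpRegistry::DeviceRegistration registration;
  registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
  // Replaces the old requires_compilation / enable_jit_by_default pair:
  // autocluster when the JIT is enabled globally (session options or
  // TF_XLA_FLAGS); explicit jit-scope requests are honoured either way.
  registration.autoclustering_policy =
      XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally;
  registration.compile_resource_ops = false;
  return registration;
}

}  // namespace tensorflow
```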
- bool enable_jit_by_default; + // When should we autocluster operators assigned to this device? + AutoclusteringPolicy autoclustering_policy; // Enable compilation of operators that use DT_RESOURCE types? bool compile_resource_ops = false; diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index 63b09c8f02a..a322eb9015e 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -26,6 +26,19 @@ limitations under the License. namespace tensorflow { +/*static*/ absl::string_view XlaResource::KindToString(XlaResource::Kind kind) { + switch (kind) { + case XlaResource::kInvalid: + return "invalid"; + case XlaResource::kVariable: + return "variable"; + case XlaResource::kStack: + return "stack"; + case XlaResource::kTensorArray: + return "tensorarray"; + } +} + XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, TensorShape shape, const xla::XlaOp& initial_value, int64 tensor_array_size, diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h index aa9ce1b171f..857b9a928bb 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.h +++ b/tensorflow/compiler/tf2xla/xla_resource.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -35,6 +36,7 @@ class XlaResource { kTensorArray, kStack, }; + static absl::string_view KindToString(Kind kind); XlaResource(Kind kind, int arg_num, string name, DataType type, TensorShape shape, const xla::XlaOp& initial_value, diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index d6b60c5f991..91096cf1d04 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -68,7 +68,7 @@ cc_library( visibility = [":friends"], deps = [ ":xla_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla:debug_options_flags", ], ) @@ -735,6 +735,70 @@ tf_cc_test( ], ) +cc_library( + name = "parse_flags_from_env", + srcs = ["parse_flags_from_env.cc"], + hdrs = ["parse_flags_from_env.h"], + deps = + [ + "//tensorflow/compiler/xla:types", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "parse_flags_from_env_test", + srcs = ["parse_flags_from_env_test.cc"], + deps = + [ + ":parse_flags_from_env", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "@com_google_absl//absl/strings:str_format", + ], +) + +cc_library( + name = "debug_options_flags", + srcs = [ + "debug_options_flags.cc", + "debug_options_parsers.h", + ], + hdrs = ["debug_options_flags.h"], + deps = + [ + ":parse_flags_from_env", + "//tensorflow/compiler/xla:xla_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "debug_options_parsers_test", + size = "small", + srcs = [ + "debug_options_parsers.h", + "debug_options_parsers_test.cc", + ], + deps = + [ + "//tensorflow/compiler/xla:xla_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "@com_google_absl//absl/strings", + 
"@com_google_absl//absl/strings:str_format", + ], +) + # ----------------------------------------------------------------------------- # This is a headers target that extra XLA devices can use to prevent circular dependencies. Devices that are compiled as separate shared objects can also use it to prevent linking of library code. diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 0cbe68d7efd..42da0ebf499 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -68,6 +68,7 @@ cc_library( deps = [ ":global_data", ":xla_computation", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:service_interface", @@ -76,7 +77,6 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/core:lib", "@com_google_absl//absl/memory", @@ -236,13 +236,13 @@ tf_cc_test( deps = [ ":xla_builder", ":xla_computation", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/core:test", diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index f5f8d5c6b1f..eef2844e0df 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -21,8 +21,8 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/execution_options_util.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -210,11 +210,10 @@ StatusOr Client::LoadSnapshot(const HloSnapshot& module) { return XlaComputation(module.hlo().hlo_module()); } -StatusOr> Client::Execute( - const XlaComputation& computation, absl::Span arguments, - const ExecutionOptions* execution_options, - ExecutionProfile* execution_profile) { - ExecuteGraphRequest request; +StatusOr Client::Compile( + const XlaComputation& computation, absl::Span argument_shapes, + const ExecutionOptions* execution_options) { + CompileRequest request; *request.mutable_computation() = computation.proto(); if (execution_options == nullptr) { @@ -222,6 +221,34 @@ StatusOr> Client::Execute( } else { *request.mutable_execution_options() = *execution_options; } + if (request.execution_options().device_handles_size() > 1) { + return InvalidArgument( + "Compiling with multiple device handles is not supported. Use " + "'Execute' instead."); + } + + // The argument shapes affect how the computation is compiled. 
+ for (const auto& arg_shape : argument_shapes) { + *request.add_input_shape_with_layout() = arg_shape; + } + + CompileResponse response; + VLOG(1) << "making compile request: " << request.ShortDebugString(); + Status s = stub_->Compile(&request, &response); + VLOG(1) << "done with request"; + + if (!s.ok()) { + return s; + } + TF_RET_CHECK(response.has_handle()); + return response.handle(); +} + +StatusOr> Client::Execute( + const ExecutionHandle& handle, absl::Span arguments, + ExecutionProfile* execution_profile) { + ExecuteRequest request; + *request.mutable_handle() = handle; for (GlobalData* argument : arguments) { CHECK(argument != nullptr) << "Argument pointers must not be null."; *request.add_arguments() = argument->handle(); @@ -229,7 +256,7 @@ StatusOr> Client::Execute( ExecuteResponse response; VLOG(1) << "making execute request: " << request.ShortDebugString(); - Status s = stub_->ExecuteGraph(&request, &response); + Status s = stub_->Execute(&request, &response); VLOG(1) << "done with request"; if (!s.ok()) { @@ -238,15 +265,62 @@ StatusOr> Client::Execute( if (execution_profile != nullptr) { *execution_profile = response.profile(); + } + + return absl::make_unique(stub_, response.output()); +} + +StatusOr> Client::Execute( + const XlaComputation& computation, absl::Span arguments, + const ExecutionOptions* execution_options, + ExecutionProfile* execution_profile) { + if (execution_options != nullptr && + execution_options->device_handles_size() > 1) { + std::vector computation_instances = { + XlaComputationInstance{ + computation, + std::vector(arguments.begin(), arguments.end()), + *execution_options, execution_profile}}; + TF_ASSIGN_OR_RETURN(auto results, ExecuteParallel(computation_instances)); + // The result selection is a bit hacky, but better than assuming it is + // device 0. + // + // TODO(b/118493728): Allow Execute to return one result per computation. + for (int64 i = 0; i < results.size(); i++) { + TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(*results[i])); + if (!ShapeUtil::IsEmptyTuple(shape)) { + VLOG(3) << "Fetching result from device " << i << ": " + << ShapeUtil::HumanString(shape); + return std::move(results[i]); + } + } + TF_RET_CHECK(!results.empty()); + VLOG(1) << "Defaulting to device 0 result"; + return std::move(results[0]); + } + + // The argument shapes affect how the computation is compiled. 
+ std::vector arg_shapes(arguments.size()); + for (int i = 0; i < arguments.size(); i++) { + TF_ASSIGN_OR_RETURN(arg_shapes[i], GetShape(*arguments[i])); + } + + TF_ASSIGN_OR_RETURN(auto handle, + Compile(computation, arg_shapes, execution_options)); + + TF_ASSIGN_OR_RETURN(auto result, + Execute(handle, arguments, execution_profile)); + + if (execution_profile != nullptr) { if (VLOG_IS_ON(1)) { TF_ASSIGN_OR_RETURN( auto execution_stats, - ExecutionStatsAsString(computation, response.profile())); + ExecutionStatsAsString(computation, *execution_profile)); VLOG(1) << execution_stats; } } - return absl::make_unique(stub_, response.output()); + return std::move(result); } StatusOr>> Client::ExecuteParallel( @@ -274,10 +348,11 @@ StatusOr>> Client::ExecuteParallel( } std::vector> outputs; - for (size_t i = 0; i < computations.size(); ++i) { + for (size_t i = 0; i < response.responses_size(); ++i) { outputs.push_back( absl::make_unique(stub_, response.responses(i).output())); - if (computations[i].execution_profile != nullptr) { + if (i < computations.size() && + computations[i].execution_profile != nullptr) { *computations[i].execution_profile = response.responses(i).profile(); } } @@ -390,8 +465,7 @@ StatusOr Client::ExecutionStatsAsString( const XlaComputation& computation, const ExecutionProfile& profile) { TF_ASSIGN_OR_RETURN( auto computation_stats, - GetComputationStats(computation, - legacy_flags::GetDebugOptionsFromFlags())); + GetComputationStats(computation, GetDebugOptionsFromFlags())); int64 total_flops = computation_stats.flop_count() + computation_stats.transcendental_count(); if (profile.compute_time_ns() > 0) { diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h index 6f4d33c469f..d0ac4703c63 100644 --- a/tensorflow/compiler/xla/client/client.h +++ b/tensorflow/compiler/xla/client/client.h @@ -40,6 +40,31 @@ class Client { explicit Client(ServiceInterface* stub); virtual ~Client(); + // Compiles the computation with the given argument shapes and returns the + // handle to the compiled executable. The compiled executable is cached on the + // service, and the returned handle can be used for execution without + // re-compilation. + // * The shape and layout of the arguments affect how + // the computation is compiled. If argument_shapes is empty, the parameters' + // shape and layout will be used in the compilation. + // * If execution_options is not nullptr, these options are passed to the + // service to affect how it compiles our computation. (The pointer does not + // need to live beyond this call.) + // * execution_options.device_handles should be empty. If you need + // non-empty device handles, call 'Execute' instead. + StatusOr Compile( + const XlaComputation& computation, + absl::Span argument_shapes, + const ExecutionOptions* execution_options = nullptr); + + // Executes the compiled executable for the given handle with the given + // arguments and returns the global data that was produced from the execution. + // * If execution_profile is not nullptr then the pointed-to ExecutionProfile + // will be filled with profile data from the execution. + StatusOr> Execute( + const ExecutionHandle& handle, absl::Span arguments, + ExecutionProfile* execution_profile = nullptr); + + // Executes the computation with the given arguments and returns the global // data that was produced from the execution.
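A sketch of the compile-once / execute-many flow enabled by the new Client::Compile and handle-based Execute above. The exact template arguments follow the signatures in client.h; the function name is illustrative, the caller is assumed to already hold a Client*, an XlaComputation with one parameter, and a GlobalData* argument, and error handling is elided.

```c++
#include <memory>

#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/xla_computation.h"

namespace xla {

std::unique_ptr<GlobalData> CompileOnceRunTwice(
    Client* client, const XlaComputation& computation, GlobalData* arg_data) {
  // The argument shape affects how the computation is compiled.
  Shape arg_shape = client->GetShape(*arg_data).ValueOrDie();

  // Compile() caches the executable on the service and returns a handle.
  ExecutionHandle handle =
      client->Compile(computation, {arg_shape}).ValueOrDie();

  // The same handle can be executed repeatedly without re-compilation.
  std::unique_ptr<GlobalData> first =
      client->Execute(handle, {arg_data}).ConsumeValueOrDie();
  return client->Execute(handle, {arg_data}).ConsumeValueOrDie();
}

}  // namespace xla
```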
// * If execution_options is not nullptr, these options are passed to the diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index d3d7edb42a3..08a887a6e46 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -265,6 +265,22 @@ XlaOp Digamma(XlaOp input) { return result; } +// Implements Banker's rounding: numbers that are equidistant between two +// integers are rounded towards even. +XlaOp RoundToEven(XlaOp x) { + auto half = xla::ScalarLike(x, 0.5); + auto one = xla::ScalarLike(x, 1.0); + auto two = xla::ScalarLike(x, 2.0); + + auto round_val = xla::Floor(x); + auto fraction = x - round_val; + auto nearest_even_int = round_val - two * xla::Floor(half * x); + auto is_odd = xla::Eq(nearest_even_int, one); + return xla::Select(xla::Or(xla::Gt(fraction, half), + xla::And(xla::Eq(fraction, half), is_odd)), + round_val + one, round_val); +} + // Trigonometric functions. // acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h index a6cafd42077..3f06d04b9ae 100644 --- a/tensorflow/compiler/xla/client/lib/math.h +++ b/tensorflow/compiler/xla/client/lib/math.h @@ -51,6 +51,10 @@ XlaOp Lgamma(XlaOp input); // Computes an approximation of the digamma function. XlaOp Digamma(XlaOp input); +// Rounds the given number to even when the number is equidistant between two +// integers. +XlaOp RoundToEven(XlaOp x); + // Trigonometric functions // Computes the arc cosine of 'x'. diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index 14c259a7fa2..ae2ea225d1a 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -136,5 +136,17 @@ XLA_TEST_F(MathTest, Digamma) { ComputeAndCompareR1(&builder, expected, {}, error_spec_); } +XLA_TEST_F(MathTest, RoundToEven) { + XlaBuilder builder(TestName()); + auto x = ConstantR1( + &builder, {-1.4, -1.5, -2.5, -0.5, 0, 0.5, 1.5, 2.5, 3.5, 4.5}); + RoundToEven(x); + + std::vector expected = {-1.0, -2.0, -2.0, -0.0, 0, + 0.0, 2.0, 2.0, 4.0, 4.0}; + + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index feb2f8ec9da..e49451ca970 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -60,8 +60,8 @@ class LocalExecutable { // Validates that the given arguments and options satisfy various constraints // of the computation. // - // The given ExecutableRunOptions override any values from legacy_flags - // (TF_XLA_FLAGS environment variable). + // The given ExecutableRunOptions override any values from TF_XLA_FLAGS + // environment variable. Status ValidateExecutionOptions( const absl::Span arguments, const ExecutableRunOptions& run_options, const Backend& backend); @@ -69,8 +69,8 @@ class LocalExecutable { // Records the computation in a SessionModule proto with the arguments used to // invoke it, and the result. Enabled by flag: --tla_dump_executions_to. // - // The given ServiceExecutableRunOptions override any values from legacy_flags - // (TF_XLA_FLAGS environment variable). + // The given ServiceExecutableRunOptions override any values from TF_XLA_FLAGS + // environment variable. 
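The RoundToEven helper added to math.cc above implements banker's rounding with Floor/Select. A scalar re-implementation of the same formula, useful for sanity-checking it outside XLA (a sketch, not part of the patch); it reproduces the expectations in math_test.cc, e.g. 1.5 -> 2, 2.5 -> 2, -2.5 -> -2:

```c++
#include <cmath>
#include <cstdio>

// Scalar version of the RoundToEven logic: ties between two integers are
// rounded towards the even one.
double RoundToEvenScalar(double x) {
  double round_val = std::floor(x);
  double fraction = x - round_val;
  // 1.0 if floor(x) is odd, 0.0 if it is even (floor(x) minus the largest
  // even integer <= x).
  double nearest_even_int = round_val - 2.0 * std::floor(0.5 * x);
  bool is_odd = (nearest_even_int == 1.0);
  if (fraction > 0.5 || (fraction == 0.5 && is_odd)) {
    return round_val + 1.0;
  }
  return round_val;
}

int main() {
  const double inputs[] = {-1.5, -2.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5};
  for (double v : inputs) {
    std::printf("RoundToEven(%.1f) = %.1f\n", v, RoundToEvenScalar(v));
  }
  return 0;
}
```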
StatusOr ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const absl::Span arguments); @@ -114,8 +114,8 @@ class LocalClient : public Client { // Build and return a LocalExecutable object. The executable is compiled using // the given XlaComputation, argument layouts and options. // - // The given ExecutableBuildOptions override any values from legacy_flags - // (TF_XLA_FLAGS environment variable). + // The given ExecutableBuildOptions override any values from TF_XLA_FLAGS + // environment variable. StatusOr> Compile( const XlaComputation& computation, const absl::Span argument_layouts, diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index f9c23b44810..0a587725d20 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -2305,6 +2305,19 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape, }); } +XlaOp XlaBuilder::GetDimensionSize(const XlaOp& operand, int64 dimension) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const auto& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferGetDimensionSizeShape(operand_shape, dimension)); + instr.add_dimensions(dimension); + return AddInstruction(std::move(instr), HloOpcode::kGetDimensionSize, + {operand}); + }); +} + StatusOr XlaBuilder::IsConstant(const XlaOp& operand) const { TF_RETURN_IF_ERROR(first_error_); @@ -3158,4 +3171,8 @@ XlaOp Iota(XlaBuilder* builder, const Shape& shape, int64 iota_dimension) { return builder->Iota(shape, iota_dimension); } +XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension) { + return operand.builder()->GetDimensionSize(operand, dimension); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 908a616b4ea..68314a026ea 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -933,6 +933,8 @@ class XlaBuilder { const XlaOp& grad_output, float epsilon, int64 feature_index); + XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension); + StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, absl::Span operands = {}); @@ -1355,6 +1357,8 @@ class XlaBuilder { const string& outfeed_config); friend XlaOp CreateToken(XlaBuilder* builder); friend XlaOp AfterAll(XlaBuilder* builder, absl::Span tokens); + + friend XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension); }; // RAII-style object: sets the current sharding assignment in builder on @@ -2129,6 +2133,10 @@ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, const XlaOp& grad_output, float epsilon, int64 feature_index); +// Returns the size of the given dimension of the operand. The operand must be +// array shaped. +XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension); + // Implementation details below this point. template diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index dfe5fd5eb23..8aa85c3cd63 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
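A minimal builder-side use of the new GetDimensionSize op added above, mirroring the xla_builder_test.cc case that follows (the function name is illustrative and not part of the patch):

```c++
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

XlaComputation BuildGetDimensionSizeExample() {
  XlaBuilder b("get_dimension_size_example");
  XlaOp x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
  // Lowers to an HLO instruction with opcode kGetDimensionSize; the result
  // is the size of dimension 1 (here, 7) as a scalar integer.
  GetDimensionSize(x, /*dimension=*/1);
  return b.Build().ConsumeValueOrDie();
}

}  // namespace xla
```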
#include #include "tensorflow/compiler/xla/client/xla_computation.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -43,7 +43,7 @@ class XlaBuilderTest : public ::testing::Test { const HloModuleProto& proto = computation.proto(); TF_ASSIGN_OR_RETURN(const auto& config, HloModule::CreateModuleConfigFromProto( - proto, legacy_flags::GetDebugOptionsFromFlags())); + proto, GetDebugOptionsFromFlags())); return HloModule::CreateFromProto(proto, config); } @@ -54,7 +54,7 @@ class XlaBuilderTest : public ::testing::Test { const HloModuleProto& proto = computation.proto(); TF_ASSIGN_OR_RETURN(const auto& config, HloModule::CreateModuleConfigFromProto( - proto, legacy_flags::GetDebugOptionsFromFlags())); + proto, GetDebugOptionsFromFlags())); return HloModule::CreateFromProto(proto, config); } @@ -349,6 +349,15 @@ TEST_F(XlaBuilderTest, CollectivePermute) { EXPECT_EQ(root->opcode(), HloOpcode::kCollectivePermute); } +TEST_F(XlaBuilderTest, GetDimensionSize) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x"); + GetDimensionSize(x, 1); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kGetDimensionSize); +} + TEST_F(XlaBuilderTest, ReportError) { XlaBuilder b(TestName()); auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x"); diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc similarity index 96% rename from tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc rename to tensorflow/compiler/xla/debug_options_flags.cc index 3ed3afcfced..033887d7c11 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -13,17 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include // NOLINT(build/c++11): only using std::call_once, not mutex. 
#include #include "absl/strings/str_split.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h" -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" +#include "tensorflow/compiler/xla/debug_options_parsers.h" +#include "tensorflow/compiler/xla/parse_flags_from_env.h" namespace xla { -namespace legacy_flags { - namespace { DebugOptions* flag_values; @@ -101,8 +99,8 @@ void AllocateFlags() { [](string comma_separated_values) { auto* extra_options_map = flag_values->mutable_xla_backend_extra_options(); - impl::parse_xla_backend_extra_options(extra_options_map, - comma_separated_values); + parse_xla_backend_extra_options(extra_options_map, + comma_separated_values); return true; }; @@ -111,8 +109,8 @@ void AllocateFlags() { [](string reduce_precision_option_value) { HloReducePrecisionOptions* option_proto = flag_values->add_hlo_reduce_precision_options(); - return impl::parse_xla_reduce_precision_option( - option_proto, reduce_precision_option_value); + return parse_xla_reduce_precision_option(option_proto, + reduce_precision_option_value); }; flag_objects = new std::vector({ @@ -353,5 +351,4 @@ xla::DebugOptions GetDebugOptionsFromFlags() { return *flag_values; } -} // namespace legacy_flags } // namespace xla diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h b/tensorflow/compiler/xla/debug_options_flags.h similarity index 81% rename from tensorflow/compiler/xla/legacy_flags/debug_options_flags.h rename to tensorflow/compiler/xla/debug_options_flags.h index b53157f59c6..60e59abc2a2 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h +++ b/tensorflow/compiler/xla/debug_options_flags.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_ -#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_ +#ifndef TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_ +#define TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_ #include @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/util/command_line_flags.h" namespace xla { -namespace legacy_flags { // Appends flag definitions for debug options to flag_list. void AppendDebugOptionsFlags(std::vector* flag_list); @@ -32,7 +31,6 @@ void AppendDebugOptionsFlags(std::vector* flag_list); // first. xla::DebugOptions GetDebugOptionsFromFlags(); -} // namespace legacy_flags } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_ +#endif // TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_ diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h b/tensorflow/compiler/xla/debug_options_parsers.h similarity index 94% rename from tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h rename to tensorflow/compiler/xla/debug_options_parsers.h index ee7eb019c07..80aadfd5ece 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h +++ b/tensorflow/compiler/xla/debug_options_parsers.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_ -#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_ +#ifndef TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_ +#define TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_ #include #include "absl/strings/numbers.h" @@ -23,8 +23,6 @@ limitations under the License. #include "tensorflow/compiler/xla/xla.pb.h" namespace xla { -namespace legacy_flags { -namespace impl { template void parse_xla_backend_extra_options(T* extra_options_map, @@ -140,8 +138,6 @@ inline bool parse_xla_reduce_precision_option( return true; } -} // namespace impl -} // namespace legacy_flags } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_ +#endif // TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_ diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc b/tensorflow/compiler/xla/debug_options_parsers_test.cc similarity index 88% rename from tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc rename to tensorflow/compiler/xla/debug_options_parsers_test.cc index 6f197aec53c..8003c3496d5 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc +++ b/tensorflow/compiler/xla/debug_options_parsers_test.cc @@ -15,7 +15,7 @@ limitations under the License. // Test for parse_flags_from_env.cc -#include "tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h" +#include "tensorflow/compiler/xla/debug_options_parsers.h" #include #include @@ -23,13 +23,12 @@ limitations under the License. #include "tensorflow/core/platform/test.h" namespace xla { -namespace legacy_flags { // Test that the xla_backend_extra_options flag is parsed correctly. 
TEST(DebugOptionsFlags, ParseXlaBackendExtraOptions) { std::unordered_map test_map; string test_string = "aa=bb,cc,dd=,ee=ff=gg"; - impl::parse_xla_backend_extra_options(&test_map, test_string); + parse_xla_backend_extra_options(&test_map, test_string); EXPECT_EQ(test_map.size(), 4); EXPECT_EQ(test_map.at("aa"), "bb"); EXPECT_EQ(test_map.at("cc"), ""); @@ -41,7 +40,7 @@ TEST(DebugOptionsFlags, ParseXlaBackendExtraOptions) { TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStrings) { HloReducePrecisionOptions proto; string test_string = "OP_OUTPUTS=5,10:add,dot"; - EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string)); + EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string)); EXPECT_EQ(proto.location(), HloReducePrecisionOptions::OP_OUTPUTS); EXPECT_EQ(proto.exponent_bits(), 5); EXPECT_EQ(proto.mantissa_bits(), 10); @@ -56,7 +55,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStrings) { TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStringsSemicolon) { HloReducePrecisionOptions proto; string test_string = "OP_OUTPUTS=5,10:add,dot;"; - EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string)); + EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string)); EXPECT_EQ(proto.location(), HloReducePrecisionOptions::OP_OUTPUTS); EXPECT_EQ(proto.exponent_bits(), 5); EXPECT_EQ(proto.mantissa_bits(), 10); @@ -71,7 +70,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStringsSemicolon) { TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoOpcodes) { HloReducePrecisionOptions proto; string test_string = "UNFUSED_OP_OUTPUTS=5,10:;foo,bar/baz"; - EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string)); + EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string)); EXPECT_EQ(proto.location(), HloReducePrecisionOptions::UNFUSED_OP_OUTPUTS); EXPECT_EQ(proto.exponent_bits(), 5); EXPECT_EQ(proto.mantissa_bits(), 10); @@ -84,7 +83,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoOpcodes) { TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionBoth) { HloReducePrecisionOptions proto; string test_string = "UNFUSED_OP_OUTPUTS=5,10:subtract;foo,bar/baz"; - EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string)); + EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string)); EXPECT_EQ(proto.location(), HloReducePrecisionOptions::UNFUSED_OP_OUTPUTS); EXPECT_EQ(proto.exponent_bits(), 5); EXPECT_EQ(proto.mantissa_bits(), 10); @@ -96,7 +95,6 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionBoth) { EXPECT_EQ(proto.opname_substrings_to_suffix(1), "bar/baz"); } -} // namespace legacy_flags } // namespace xla int main(int argc, char* argv[]) { diff --git a/tensorflow/compiler/xla/execution_options_util.cc b/tensorflow/compiler/xla/execution_options_util.cc index e83ff7cddd6..cf569863bbe 100644 --- a/tensorflow/compiler/xla/execution_options_util.cc +++ b/tensorflow/compiler/xla/execution_options_util.cc @@ -13,14 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
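After the move out of legacy_flags shown above, call sites drop both the legacy_flags:: namespace and the legacy_flags/ include path. A one-line sketch of the updated usage (the wrapper function is illustrative only):

```c++
#include "tensorflow/compiler/xla/debug_options_flags.h"

xla::DebugOptions GetOptionsForExample() {
  // Previously xla::legacy_flags::GetDebugOptionsFromFlags().
  return xla::GetDebugOptionsFromFlags();
}
```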
==============================================================================*/ #include "tensorflow/compiler/xla/execution_options_util.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" namespace xla { ExecutionOptions CreateDefaultExecutionOptions() { ExecutionOptions execution_options; - *(execution_options.mutable_debug_options()) = - legacy_flags::GetDebugOptionsFromFlags(); + *(execution_options.mutable_debug_options()) = GetDebugOptionsFromFlags(); return execution_options; } diff --git a/tensorflow/compiler/xla/g3doc/jit.md b/tensorflow/compiler/xla/g3doc/jit.md index 5376a04669d..ded1e582b24 100644 --- a/tensorflow/compiler/xla/g3doc/jit.md +++ b/tensorflow/compiler/xla/g3doc/jit.md @@ -58,7 +58,7 @@ sess = tf.Session(config=config) > compiled for the CPU. JIT compilation for CPU operations must be done via > the manual method documented below. -#### Manual +#### Manual with experimental_jit_scope() JIT compilation can also be turned on manually for one or more operators. This is done by tagging the operators to compile with the attribute @@ -79,6 +79,16 @@ The `_XlaCompile` attribute is currently supported on a best-effort basis. If an operator cannot be compiled, TensorFlow will silently fall back to the normal implementation. +#### Manual with xla.compile() + +Unlike experimental_jit_scope() which silently falls back to normal Tensorflow +on uncompilable operator, xla.compile() returns an explicit error. This is +useful if you want more predictable behaviors from XLA compilation. + +Please see +[xla.compile() tutorial Colab](https://colab.sandbox.google.com/github/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb) +for how to use it. + ### Placing operators on XLA devices Another way to run computations via XLA is to place an operator on a specific diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index a3cdfe19b2e..73a9db75f6b 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1339,6 +1339,22 @@ the semantics for `tf.gather_nd`. index `X` in the gather indices array picks an entire row and the result is the concatenation of all these rows. +## GetDimensionSize + +See also +[`XlaBuilder::GetDimensionSize`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). + +Returns the size of the given dimension of the operand. The operand must be +array shaped. + + `GetDimensionSize(operand, dimension)` + +| Arguments | Type | Semantics | +| ----------- | ------- | --------------------------------------------------- | +| `operand` | `XlaOp` | n dimensional input array | +| `dimension` | `int64` | A value in the interval `[0, n)` that specifies the | +: : : dimension : + ## GetTupleElement See also diff --git a/tensorflow/compiler/xla/g3doc/overview.md b/tensorflow/compiler/xla/g3doc/overview.md index 6a172c3ae15..d3428b72761 100644 --- a/tensorflow/compiler/xla/g3doc/overview.md +++ b/tensorflow/compiler/xla/g3doc/overview.md @@ -4,11 +4,8 @@ -> Note: XLA is experimental and considered alpha. Most use cases will not -> see improvements in performance (speed or decreased memory usage). We have -> released XLA early so the Open Source Community can contribute to its -> development, as well as create a path for integration with hardware -> accelerators. +> Note: XLA is still under development. 
Some use cases will not +> see improvements in speed or decreased memory usage. XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear algebra that optimizes TensorFlow computations. The results are improvements in diff --git a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb new file mode 100644 index 00000000000..a83e3f78598 --- /dev/null +++ b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "f4TSNCvpENrW" + }, + "source": [ + "##### Copyright 2018 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "cellView": "form", + "colab": {}, + "colab_type": "code", + "id": "vamNSA0vEP-m" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "cellView": "form", + "colab": {}, + "colab_type": "code", + "id": "xD_ydfejEV7H" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "e1oSi4lHFt3z" + }, + "source": [ + "# Welcome to `xla.compile()` tutorial" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "b7noD9NjFRL-" + }, + "source": [ + "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", + " \u003ctd\u003e\n", + " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/xla/jit#turning_on_jit_compilation\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca target=\"_blank\" href=\"https://colab.sandbox.google.com/github/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", + " \u003c/td\u003e\n", + "\u003c/table\u003e" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "v9YbsuLZaBXy" + }, + "source": [ + "xla.compile() is a new experimental API that compiles part or all of a model with [XLA](https://www.tensorflow.org/extend/xla/).\n", + "\n", + "Please run all code blocks in order." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "45kUPj5ZFrRa" + }, + "outputs": [], + "source": [ + "import tensorflow as tf" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9NMQFjroSMns" + }, + "source": [ + "Imports XLA library, which includes xla.compile() experimental API." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "-Uggy03rSGJm" + }, + "outputs": [], + "source": [ + "from tensorflow.contrib.compiler import xla" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "GZVNiRmTDV-5" + }, + "source": [ + "Define some necessary constants and prepare MNIST dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "f37TSEGvGX4_" + }, + "outputs": [], + "source": [ + "# Size of each input image, 28 x 28 pixels\n", + "IMAGE_SIZE = 28 * 28\n", + "# Number of distinct number labels, [0..9]\n", + "NUM_CLASSES = 10\n", + "# Number of examples in each training batch (step)\n", + "TRAIN_BATCH_SIZE = 100\n", + "# Number of training steps to run\n", + "TRAIN_STEPS = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TiVXchblG5hK" + }, + "outputs": [], + "source": [ + "# Loads MNIST dataset.\n", + "train, test = tf.keras.datasets.mnist.load_data()\n", + "train_ds = tf.data.Dataset.from_tensor_slices(train).batch(TRAIN_BATCH_SIZE).repeat()\n", + "test_ds = tf.data.Dataset.from_tensor_slices(test).batch(TRAIN_BATCH_SIZE)\n", + "\n", + "iterator = tf.data.Iterator.from_structure(train_ds.output_types, train_ds.output_shapes)\n", + "images, labels = iterator.get_next()\n", + "images = tf.reshape(images, [-1, IMAGE_SIZE])\n", + "images, labels = tf.cast(images, tf.float32), tf.cast(labels, tf.int64)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "x_ZehpZP-SfS" + }, + "source": [ + "## Defines build_mnist_model function to construct model\n", + "\n", + "Following code block contains a function that constructs a simple model with one dense layer, including both forward and backward propagation.\n", + "\n", + "When called, it returns two values. `y` is a `tf.Tensor` representing predicted probability of each target class, `train_step` is a `tf.Operation` that increments `global_step` and applies variable update." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ZbhJl_WvGa3g" + }, + "outputs": [], + "source": [ + "def build_mnist_model(x, y_):\n", + " y = tf.keras.layers.Dense(NUM_CLASSES).apply(x)\n", + "\n", + " cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)\n", + " train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)\n", + "\n", + " return y, train_step" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "7Jh3lyQHDfM9" + }, + "source": [ + "## Uses xla.compile with build_mnist_model function to enable XLA" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "EtDwez_1gjzv" + }, + "source": [ + "Following code block wraps the model with xla.compile(), which allows the target function with provided inputs to be executed by XLA." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kYpCXCdRHNuN" + }, + "outputs": [], + "source": [ + "[y] = xla.compile(build_mnist_model, inputs=[images, labels])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "4giQh62IrZGF" + }, + "source": [ + "When compiling the graph, XLA replaces all the graph nodes constructed in the target function with a few XLA ops.\n", + "\n", + "xla.compile does not return any\n", + "`tf.Operation` nodes that can be executed independently from the generated XLA ops. Instead, returned `tf.Operation` nodes from the target function are added as control dependencies of all returned `tf.Tensor` values. 
This triggers execution of the `tf.Operation` nodes when the returned tensors are evaluated.\n", + "\n", + "In pseudo-code, xla.compile's implementation looks as follows:\n", + "\n", + "---\n", + "```\n", + "# Ask Tensorflow to execute code in XLA-friendly manner\n", + "\n", + "y, train_step = build_mnist_model(images, labels)\n", + "with tf.control_dependencies([train_step]):\n", + " y = tf.identity(y)\n", + "\n", + "# Ask Tensorflow to STOP executing code in XLA-friendly manner\n", + "```\n", + "---\n", + "\n", + "xla.compile() always returns a list of `tf.Tensor`'s (even if there is only one-element)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "TPGas4jjFLZl" + }, + "source": [ + "If you were to print the constructed graph now, you will see that it is not much different from a normal Tensorflow graph and you won't be able to find XLA ops mentioned before. This is because the actual compilation happens later when you try to execute the graph with `sess.run()`. At that time, Tensorflow triggers a series of graph rewrite passes that actually generate XLA ops, which compiles and executes computation when all inputs are ready." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "EZD1m_n1DxAF" + }, + "source": [ + "## Trains and tests the model" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "qe28bAHNHUG2" + }, + "outputs": [], + "source": [ + "# Creates session and initialize all variables.\n", + "# xla.compile() doesn't work with Keras model.fit() API or TF eager mode yet.\n", + "sess = tf.Session()\n", + "sess.run(tf.global_variables_initializer())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qgsKmz3n2UiW" + }, + "source": [ + "Following code block trains model.\n", + "\n", + "Note that evaluating `y` also triggers its control dependency node `train_step`, which updates model variables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_GxF6jTRHVuA" + }, + "outputs": [], + "source": [ + "# Feeds training dataset\n", + "sess.run(iterator.make_initializer(train_ds))\n", + "\n", + "# Runs TRAIN_STEPS steps\n", + "for i in range(TRAIN_STEPS):\n", + " sess.run(y)\n", + "print(\"Model trained for %s steps.\" % TRAIN_STEPS)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "dHlQlRSRHXD1" + }, + "outputs": [], + "source": [ + "# Tests trained model\n", + "\n", + "# Feeds testing dataset\n", + "sess.run(iterator.make_initializer(test_ds))\n", + "\n", + "# Calculates accuracy\n", + "correct_prediction = tf.equal(tf.argmax(y, 1), labels)\n", + "accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))\n", + "print(\"Prediction accuracy after training: %s\" % sess.run(accuracy))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ynJQIuzjHYOb" + }, + "outputs": [], + "source": [ + "# Cleans up session\n", + "sess.close()" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "xla.compile() Tutorial", + "provenance": [], + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 2", + "name": "python2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/compiler/xla/index_util_test.cc b/tensorflow/compiler/xla/index_util_test.cc index 93522d2ca87..fa94d0afb4c 100644 --- a/tensorflow/compiler/xla/index_util_test.cc +++ b/tensorflow/compiler/xla/index_util_test.cc @@ -24,8 +24,7 @@ limitations under the License. namespace xla { namespace { -void SetMinorToMajorLayout(Shape* shape, - std::initializer_list dimensions) { +void SetMinorToMajorLayout(Shape* shape, std::vector dimensions) { shape->mutable_layout()->clear_minor_to_major(); for (auto dimension : dimensions) { shape->mutable_layout()->add_minor_to_major(dimension); @@ -122,7 +121,7 @@ TEST(IndexUtilTest, LinearToMultiToLinear) { std::vector linear_indexes = {0, 1439999999, 1145567336, 43883404, 617295214, 1117613654}; - std::vector> minor_to_major_orders; + std::vector> minor_to_major_orders; minor_to_major_orders.push_back({6, 5, 4, 3, 2, 1, 0}); minor_to_major_orders.push_back({0, 1, 2, 3, 4, 5, 6}); minor_to_major_orders.push_back({4, 5, 1, 2, 6, 0, 3}); diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD deleted file mode 100644 index 3e79129aafd..00000000000 --- a/tensorflow/compiler/xla/legacy_flags/BUILD +++ /dev/null @@ -1,82 +0,0 @@ -# Legacy command-line flags for the XLA libraries. - -# Please do not add more flags to this package. - -# The XLA libraries were written in an environment that allowed command-line -# flags to be scattered freely throughout the libraries. This model, while -# initially convenient, leads to a proliferation in unused command-line flags -# in tests and binaries, and serious problems in servers, where one might wish -# parameters to be different in independent RPC calls to the same routine. -# -# Please don't add more flags. If you're a library author, pass options and -# parameters explicitly through the library's interface. 
- -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 - -load("//tensorflow:tensorflow.bzl", "tf_cc_test") - -cc_library( - name = "parse_flags_from_env", - srcs = ["parse_flags_from_env.cc"], - hdrs = ["parse_flags_from_env.h"], - deps = - [ - "//tensorflow/compiler/xla:types", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "@com_google_absl//absl/strings", - ], -) - -tf_cc_test( - name = "parse_flags_from_env_test", - srcs = ["parse_flags_from_env_test.cc"], - deps = - [ - ":parse_flags_from_env", - "//tensorflow/compiler/xla:types", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "@com_google_absl//absl/strings:str_format", - ], -) - -cc_library( - name = "debug_options_flags", - srcs = [ - "debug_options_flags.cc", - "debug_options_parsers.h", - ], - hdrs = ["debug_options_flags.h"], - deps = - [ - ":parse_flags_from_env", - "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "@com_google_absl//absl/strings", - ], -) - -tf_cc_test( - name = "debug_options_parsers_test", - size = "small", - srcs = [ - "debug_options_parsers.h", - "debug_options_parsers_test.cc", - ], - deps = - [ - "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - ], -) diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 80dfdb83c35..cb00a0ab16d 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -1434,10 +1434,14 @@ bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { return EqualElementsInternal(other, &multi_index); case U8: return EqualElementsInternal(other, &multi_index); + case S16: + return EqualElementsInternal(other, &multi_index); case S32: return EqualElementsInternal(other, &multi_index); case S64: return EqualElementsInternal(other, &multi_index); + case U16: + return EqualElementsInternal(other, &multi_index); case U32: return EqualElementsInternal(other, &multi_index); case U64: @@ -1506,6 +1510,11 @@ bool LiteralBase::IsAll(int8 value) const { return AllElementsEqualValue(piece.data(), value); } return false; + case U16: + if (value >= 0) { + return AllElementsEqualValue(piece.data(), value); + } + return false; case U32: if (value >= 0) { return AllElementsEqualValue(piece.data(), value); @@ -1518,6 +1527,8 @@ bool LiteralBase::IsAll(int8 value) const { return false; case S8: return AllElementsEqualValue(piece.data(), value); + case S16: + return AllElementsEqualValue(piece.data(), value); case S32: return AllElementsEqualValue(piece.data(), value); case S64: @@ -1739,12 +1750,16 @@ bool LiteralBase::IsZero(absl::Span indices) const { switch (shape().element_type()) { case U8: return Get(indices) == 0; + case U16: + return Get(indices) == 0; case U32: return Get(indices) == 0; case U64: return Get(indices) == 0; case S8: return Get(indices) == 0; + case S16: + return Get(indices) == 0; case S32: return Get(indices) == 0; case S64: @@ -1802,6 +1817,20 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const { case S64: CopyToRepeatedField(proto->mutable_s64s(), data()); break; + case U16: + *proto->mutable_u16s() = string( + reinterpret_cast(data().data()), 
size_bytes()); + if (!kLittleEndian) { + ConvertEndianShort(proto->mutable_u16s()); + } + break; + case S16: + *proto->mutable_s16s() = string( + reinterpret_cast(data().data()), size_bytes()); + if (!kLittleEndian) { + ConvertEndianShort(proto->mutable_s16s()); + } + break; case F16: *proto->mutable_f16s() = string( reinterpret_cast(data().data()), size_bytes()); @@ -1916,6 +1945,22 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) { case U64: TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.u64s())); break; + case S16: { + const string& s(proto.s16s()); + TF_RET_CHECK(data().size() * sizeof(int16_t) == s.size()); + memcpy(untyped_data(), s.data(), s.size()); + if (!kLittleEndian) { + ConvertEndianShort(reinterpret_cast(untyped_data()), s.size()); + } + } break; + case U16: { + const string& s(proto.u16s()); + TF_RET_CHECK(data().size() * sizeof(uint16_t) == s.size()); + memcpy(untyped_data(), s.data(), s.size()); + if (!kLittleEndian) { + ConvertEndianShort(reinterpret_cast(untyped_data()), s.size()); + } + } break; case F16: { const string& s(proto.f16s()); TF_RET_CHECK(data().size() * sizeof(half) == s.size()); diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc index 9d34d9d5041..b044f0ad73f 100644 --- a/tensorflow/compiler/xla/literal_comparison.cc +++ b/tensorflow/compiler/xla/literal_comparison.cc @@ -141,8 +141,10 @@ int64 RecursiveElementCount(const Shape& shape) { total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); } return total; - } else { + } else if (ShapeUtil::IsArray(shape)) { return ShapeUtil::ElementsIn(shape); + } else { + return 0; } } diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc index 3511760ac1c..8cec37897a9 100644 --- a/tensorflow/compiler/xla/literal_test.cc +++ b/tensorflow/compiler/xla/literal_test.cc @@ -1394,6 +1394,28 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) { EXPECT_EQ(h1, r[3]); } +TEST_F(LiteralUtilTest, CopyFromProto_u16) { + uint16 u1(0xabcd); + uint16 u2(0x1234); + + const unsigned char uint16_vals[8] = {0xcd, 0xab, 0x34, 0x12, + 0x34, 0x12, 0xcd, 0xab}; + LiteralProto p; + p.mutable_shape()->set_element_type(U16); + p.mutable_shape()->clear_dimensions(); + p.mutable_shape()->add_dimensions(4); + LayoutUtil::SetToDefaultLayout(p.mutable_shape()); + p.clear_u16s(); + p.set_u16s(uint16_vals, 8); + TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p)); + auto r = literal.data(); + ASSERT_EQ(4, r.size()); + EXPECT_EQ(u1, r[0]); + EXPECT_EQ(u2, r[1]); + EXPECT_EQ(u2, r[2]); + EXPECT_EQ(u1, r[3]); +} + TEST_F(LiteralUtilTest, LiteralSliceTest) { auto scalar = LiteralUtil::CreateR0(1.0); auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc similarity index 98% rename from tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc rename to tensorflow/compiler/xla/parse_flags_from_env.cc index 2a4e49b05aa..40481331b69 100644 --- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc +++ b/tensorflow/compiler/xla/parse_flags_from_env.cc @@ -22,7 +22,7 @@ limitations under the License. 
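As a side note on the new U16 literal support exercised by the CopyFromProto_u16 test above: the proto's u16s field stores the element bytes little-endian, so the expected buffer in that test can be reproduced with a plain struct.pack. This standalone snippet is only an illustration of the byte layout and does not depend on the XLA build:

```python
import struct

# The literal holds [0xabcd, 0x1234, 0x1234, 0xabcd]; little-endian packing
# yields exactly the uint16_vals bytes used by the test.
vals = [0xabcd, 0x1234, 0x1234, 0xabcd]
packed = struct.pack('<4H', *vals)
assert packed == bytes([0xcd, 0xab, 0x34, 0x12, 0x34, 0x12, 0xcd, 0xab])
```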
#include #include -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" +#include "tensorflow/compiler/xla/parse_flags_from_env.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -31,7 +31,6 @@ limitations under the License. #include "tensorflow/core/util/command_line_flags.h" namespace xla { -namespace legacy_flags { static const char kEnvVar[] = "TF_XLA_FLAGS"; // environment variable queried static const char kWS[] = " \t\r\n"; // whitespace @@ -202,5 +201,4 @@ void ResetFlagsFromEnvForTesting(int** pargc, std::vector** pargv) { *pargv = &env_argv->argv; } -} // namespace legacy_flags } // namespace xla diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h b/tensorflow/compiler/xla/parse_flags_from_env.h similarity index 90% rename from tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h rename to tensorflow/compiler/xla/parse_flags_from_env.h index b54482ad2ba..fe86ee687f8 100644 --- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h +++ b/tensorflow/compiler/xla/parse_flags_from_env.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_ -#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_ +#ifndef TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_ +#define TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_ // This module exports ParseFlagsFromEnv(), which allows other modules to parse // flags from the environtment variable TF_XLA_FLAGS, or (if the first @@ -50,7 +50,6 @@ limitations under the License. #include "tensorflow/core/util/command_line_flags.h" namespace xla { -namespace legacy_flags { // Call tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet // unrecognized flags passed in from the environment, and return its @@ -60,7 +59,6 @@ bool ParseFlagsFromEnv(const std::vector& flag_list); // Used only for testing. Not to be used by clients. void ResetFlagsFromEnvForTesting(int** pargc, std::vector** pargv); -} // namespace legacy_flags } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_ +#endif // TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_ diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/parse_flags_from_env_test.cc similarity index 96% rename from tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc rename to tensorflow/compiler/xla/parse_flags_from_env_test.cc index 138c0c852e2..edd6538402d 100644 --- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc +++ b/tensorflow/compiler/xla/parse_flags_from_env_test.cc @@ -15,7 +15,7 @@ limitations under the License. // Test for parse_flags_from_env.cc -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" +#include "tensorflow/compiler/xla/parse_flags_from_env.h" #include #include @@ -30,7 +30,6 @@ limitations under the License. #include "tensorflow/core/util/command_line_flags.h" namespace xla { -namespace legacy_flags { // Test that XLA flags can be set from the environment. // Failure messages are accompanied by the text in msg[]. 
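The renamed parse_flags_from_env module still reads flags from the TF_XLA_FLAGS environment variable (kEnvVar above), so XLA debug options are typically supplied through the environment rather than the command line. A sketch of that usage follows; the specific flag shown, --xla_hlo_profile, is assumed as an example of a registered debug option and is not taken from this patch:

```python
import os

# Set before the process first builds XLA's DebugOptions; the variable is
# read once, the first time the flags are parsed.
os.environ["TF_XLA_FLAGS"] = "--xla_hlo_profile"

import tensorflow as tf  # XLA picks the flags up when it is first used
```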
@@ -159,12 +158,11 @@ TEST(ParseFlagsFromEnv, EnvAndFlag) { } } -} // namespace legacy_flags } // namespace xla int main(int argc, char* argv[]) { // Save name of binary so that it may invoke itself. - xla::legacy_flags::binary_name = argv[0]; + xla::binary_name = argv[0]; bool recursing = false; xla::int32 int_flag = 1; const std::vector flag_list = { @@ -173,7 +171,7 @@ int main(int argc, char* argv[]) { tensorflow::Flag("int_flag", &int_flag, "An integer flag to test with"), }; xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); - bool parse_ok = xla::legacy_flags::ParseFlagsFromEnv(flag_list); + bool parse_ok = xla::ParseFlagsFromEnv(flag_list); if (!parse_ok) { LOG(QFATAL) << "can't parse from environment\n" << usage; } diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 21685c4a5b9..63ac1c66492 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -3,6 +3,7 @@ licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//tensorflow:internal"]) load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") py_library( name = "xla_client", @@ -66,6 +67,7 @@ cc_library( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:math", + "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xrt:xrt_proto", "//tensorflow/compiler/xrt/cc:xrt_ops", @@ -81,6 +83,7 @@ tf_py_wrap_cc( srcs = ["xla.i"], swig_includes = [ "local_computation_builder.i", + "//tensorflow/python:platform/base.i", ], deps = [ ":local_computation_builder", @@ -89,5 +92,7 @@ tf_py_wrap_cc( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:cpu_plugin", - ], + ] + if_cuda_is_configured([ + "//tensorflow/compiler/xla/service:gpu_plugin", + ]), ) diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index b1fae826ab1..4d2a37cfac3 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -56,6 +57,12 @@ tensorflow::mutex g_local_client_mutex(tensorflow::LINKER_INITIALIZED); int g_replica_count GUARDED_BY(g_local_client_mutex) = 1; LocalClient* g_local_client GUARDED_BY(g_local_client_mutex) = nullptr; +string* GetPlatformNameString() { + static string* platform_name_string PT_GUARDED_BY(g_local_client_mutex) = + new string("Host"); + return platform_name_string; +} + Status InitializeReplicaCount(int replica_count) { if (replica_count < 1) { return InvalidArgument("Replica count must be >= 1; got %d.", @@ -72,17 +79,33 @@ Status InitializeReplicaCount(int replica_count) { return Status::OK(); } +Status InitializePlatformName(const string& platform_name) { + string* g_platform_name = GetPlatformNameString(); + tensorflow::mutex_lock lock(g_local_client_mutex); + if (g_local_client != nullptr) { + return FailedPrecondition( + "Attempted to set the platform name to %s, but a local XLA service was " + "previously created with a platform name of %s.", + platform_name, *g_platform_name); + } + TF_RETURN_IF_ERROR(PlatformUtil::GetPlatform(platform_name).status()); + *g_platform_name = platform_name; + return Status::OK(); +} + int GetReplicaCount() { tensorflow::mutex_lock lock(g_local_client_mutex); return g_replica_count; } LocalClient* GetOrCreateLocalClient() { + string* platform_name = GetPlatformNameString(); tensorflow::mutex_lock lock(g_local_client_mutex); if (g_local_client != nullptr) { return g_local_client; } LocalClientOptions options; + options.set_platform(PlatformUtil::GetPlatform(*platform_name).ValueOrDie()); options.set_number_of_replicas(g_replica_count); g_local_client = ClientLibrary::GetOrCreateLocalClient(options).ValueOrDie(); CHECK(g_local_client != nullptr); diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index 82f84ddb35b..9e617c48bdc 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -39,6 +39,12 @@ namespace swig { // returned. Status InitializeReplicaCount(int replica_count); +// Initializes the platform name that XLA will be initialized with (when +// first obtaining a handle to the local XLA service). If this is called after +// the handle to the local XLA service has been established, then an error is +// returned. +Status InitializePlatformName(const string& platform_name); + // Returns the replica count that is currently set, regardless of whether the // local XLA service has been instantiated yet or not. 
int GetReplicaCount(); diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index c13d00d2530..feabfdb889c 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -977,6 +977,7 @@ tensorflow::ImportNumpy(); %unignore xla; %unignore xla::swig; %unignore xla::swig::InitializeReplicaCount; +%unignore xla::swig::InitializePlatformName; %unignore xla::swig::GetReplicaCount; %unignore xla::swig::TransferToInfeedLocal; %unignore xla::swig::TransferToInfeedLocalReplica; diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 07e0e093255..92b0685dbba 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1371,6 +1371,18 @@ def initialize_replica_count(replica_count): c_api.InitializeReplicaCount(replica_count) +def initialize_platform_name(platform_name): + """Initializes the desired platform name to use on XLA service init. + + Args: + platform_name: string name of platform. + + Raises: + A runtime exception if the XLA service has already been initialized. + """ + c_api.InitializePlatformName(platform_name) + + def get_replica_count(): """Returns the current replica count used for the XLA service. diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc index 4e1435fa30a..d8123a6de28 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.cc +++ b/tensorflow/compiler/xla/rpc/grpc_service.cc @@ -47,11 +47,18 @@ namespace xla { }); } -::grpc::Status GRPCService::ExecuteGraph(::grpc::ServerContext* /*context*/, - const ExecuteGraphRequest* arg, - ExecuteResponse* result) { +::grpc::Status GRPCService::Compile(::grpc::ServerContext* /*context*/, + const CompileRequest* arg, + CompileResponse* result) { return DelegateRPC( - [this, arg, result]() { return service_->ExecuteGraph(arg, result); }); + [this, arg, result]() { return service_->Compile(arg, result); }); +} + +::grpc::Status GRPCService::Execute(::grpc::ServerContext* /*context*/, + const ExecuteRequest* arg, + ExecuteResponse* result) { + return DelegateRPC( + [this, arg, result]() { return service_->Execute(arg, result); }); } ::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context, diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h index ca1b09b6480..3e586b288a5 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.h +++ b/tensorflow/compiler/xla/rpc/grpc_service.h @@ -39,9 +39,13 @@ class GRPCService : public grpc::XlaService::Service { const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) override; - ::grpc::Status ExecuteGraph(::grpc::ServerContext* context, - const ExecuteGraphRequest* arg, - ExecuteResponse* result) override; + ::grpc::Status Compile(::grpc::ServerContext* context, + const CompileRequest* arg, + CompileResponse* result) override; + + ::grpc::Status Execute(::grpc::ServerContext* context, + const ExecuteRequest* arg, + ExecuteResponse* result) override; ::grpc::Status WaitForExecution(::grpc::ServerContext* context, const WaitForExecutionRequest* arg, diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc index 7b8ab158e13..66abf66cfd6 100644 --- a/tensorflow/compiler/xla/rpc/grpc_stub.cc +++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc @@ -62,10 +62,17 @@ Status 
GRPCStub::ResetDevice(const ResetDeviceRequest* request, }); } -Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request, - ExecuteResponse* response) { +Status GRPCStub::Compile(const CompileRequest* request, + CompileResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->ExecuteGraph(context, *request, response); + return grpc_stub_->Compile(context, *request, response); + }); +} + +Status GRPCStub::Execute(const ExecuteRequest* request, + ExecuteResponse* response) { + return MakeRPC([this, request, response](::grpc::ClientContext* context) { + return grpc_stub_->Execute(context, *request, response); }); } diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h index 8dfcb761387..f02b401399f 100644 --- a/tensorflow/compiler/xla/rpc/grpc_stub.h +++ b/tensorflow/compiler/xla/rpc/grpc_stub.h @@ -43,8 +43,11 @@ class GRPCStub : public ServiceInterface { Status ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) override; - Status ExecuteGraph(const ExecuteGraphRequest* request, - ExecuteResponse* response) override; + Status Compile(const CompileRequest* request, + CompileResponse* response) override; + + Status Execute(const ExecuteRequest* request, + ExecuteResponse* response) override; Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* request, ExecuteParallelResponse* response) override; diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto index 551ae895e05..e4f332cda22 100644 --- a/tensorflow/compiler/xla/rpc/xla_service.proto +++ b/tensorflow/compiler/xla/rpc/xla_service.proto @@ -128,11 +128,14 @@ service XlaService { returns (CreateChannelHandleResponse) { } - // Invokes the provided computation with the provided global data passed as - // immutable arguments. The request contains the whole computation graph. + // Compiles the provided computation into executable. Returns the handle of + // the executable. + rpc Compile(CompileRequest) returns (CompileResponse) {} + + // Invokes the provided executable with the provided global data passed as + // immutable arguments. The request contains the handle to the executable. // Returns global data output and execution timing. - rpc ExecuteGraph(ExecuteGraphRequest) returns (ExecuteResponse) { - } + rpc Execute(ExecuteRequest) returns (ExecuteResponse) {} // Invokes the provided list of computations in parallel with the provided // global data for each computation. 
Returns a list of global data output and diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index cd8c20d43ea..19b5c1ca25d 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -87,7 +87,6 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", ], @@ -124,7 +123,6 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", ], @@ -158,12 +156,12 @@ tf_cc_test( ":bfloat16_propagation", ":bfloat16_support", ":hlo", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep ], @@ -281,7 +279,7 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:hlo_element_type_converter", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", @@ -323,7 +321,6 @@ cc_library( ":hlo_casting_utils", ":hlo_module_config", ":hlo_proto", - ":hlo_reachability", ":name_uniquer", "//tensorflow/compiler/xla:array", "//tensorflow/compiler/xla:literal", @@ -365,7 +362,6 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -402,6 +398,7 @@ cc_library( srcs = ["hlo_reachability.cc"], hdrs = ["hlo_reachability.h"], deps = [ + ":hlo", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", @@ -420,7 +417,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], ) @@ -466,7 +462,6 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], ) @@ -519,7 +514,6 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -568,7 +562,6 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", 
"//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -591,7 +584,6 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -603,11 +595,11 @@ cc_library( hdrs = ["platform_util.h"], deps = [ ":compiler", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/strings", @@ -647,6 +639,7 @@ cc_library( ":allocation_tracker", ":backend", ":channel_tracker", + ":compilation_cache", ":compiler", ":computation_layout", ":device_memory_allocator", @@ -662,6 +655,7 @@ cc_library( ":source_map_util", ":stream_pool", ":transfer_manager", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:service_interface", @@ -673,7 +667,6 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/core:lib", "//tensorflow/core:ptr_util", "//tensorflow/core:stream_executor_no_cuda", @@ -730,12 +723,12 @@ cc_library( ":computation_layout", ":platform_util", ":service", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", @@ -811,6 +804,7 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:ptr_util", + "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", "@com_google_absl//absl/memory", ], @@ -833,6 +827,7 @@ cc_library( ":maybe_owning_device_memory", ":shaped_buffer", ":stream_pool", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:status", @@ -840,7 +835,6 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", @@ -1086,7 +1080,6 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -1103,6 +1096,7 @@ cc_library( ":hlo", ":hlo_dataflow_analysis", ":hlo_proto", + 
":hlo_reachability", ":hlo_value", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -1168,7 +1162,6 @@ tf_cc_test( "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -1388,6 +1381,7 @@ cc_library( srcs = ["multi_output_fusion.cc"], hdrs = ["multi_output_fusion.h"], deps = [ + ":hlo_reachability", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:hlo", @@ -1428,7 +1422,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", "@com_google_absl//absl/memory", @@ -1504,7 +1497,6 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "@com_google_absl//absl/memory", @@ -1556,7 +1548,6 @@ tf_cc_test( "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", "//tensorflow/core:test", @@ -1593,7 +1584,6 @@ tf_cc_test( "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", "//tensorflow/core:test", @@ -1643,7 +1633,7 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:lib", "//tensorflow/core:test", ], @@ -1695,6 +1685,19 @@ cc_library( ], ) +tf_cc_test( + name = "while_loop_analysis_test", + srcs = ["while_loop_analysis_test.cc"], + deps = [ + ":while_loop_analysis", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "while_loop_simplifier", srcs = ["while_loop_simplifier.cc"], @@ -1703,9 +1706,9 @@ cc_library( ":call_inliner", ":hlo", ":hlo_pass", + ":hlo_query", ":while_loop_analysis", "//tensorflow/compiler/xla:statusor", - "//tensorflow/core:lib", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", @@ -1717,10 +1720,12 @@ tf_cc_test( name = "while_loop_simplifier_test", srcs = ["while_loop_simplifier_test.cc"], deps = [ + ":hlo", + ":hlo_dce", ":hlo_matchers", ":while_loop_simplifier", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:lib", 
"//tensorflow/core:test", "@com_google_absl//absl/strings", @@ -1751,7 +1756,7 @@ tf_cc_test( ":hlo_matchers", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", ], ) @@ -1779,7 +1784,7 @@ tf_cc_test( ":implicit_broadcast_remover", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", ], ) @@ -1824,7 +1829,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/core:test", ], ) @@ -1858,7 +1862,7 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "@com_google_absl//absl/memory", @@ -2264,7 +2268,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -2327,13 +2330,26 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", ], ) +cc_library( + name = "compilation_cache", + srcs = ["compilation_cache.cc"], + hdrs = ["compilation_cache.h"], + deps = [ + ":executable", + ":hlo_module_config", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + ], +) + cc_library( name = "layout_assignment", srcs = [ @@ -2403,14 +2419,13 @@ tf_cc_test( ":hlo_graph_dumper", ":hlo_matchers", ":hlo_runner", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/core:test", ], ) @@ -2528,7 +2543,6 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -2595,7 +2609,6 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -2657,7 +2670,7 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", 
"//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -2698,7 +2711,6 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", @@ -2737,7 +2749,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -2809,10 +2820,9 @@ tf_cc_test( ":hlo_domain_isolator", ":hlo_domain_remover", ":hlo_parser", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", "@com_google_absl//absl/memory", @@ -3000,7 +3010,6 @@ tf_cc_test( deps = [ ":hlo_tfgraph_builder", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:protos_all_cc", ], @@ -3279,6 +3288,8 @@ cc_library( ":tuple_util", "//tensorflow/compiler/xla:literal_util", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", ], ) @@ -3305,6 +3316,7 @@ cc_library( ":hlo", ":hlo_pass", ":tuple_util", + ":while_loop_analysis", ":while_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -3324,7 +3336,7 @@ tf_cc_test( ":while_loop_invariant_code_motion", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", ], ) @@ -3354,7 +3366,7 @@ tf_cc_test( ":while_loop_constant_sinking", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", ], ) @@ -3415,7 +3427,7 @@ tf_cc_test( ":indexed_array_analysis", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:test", ], @@ -3512,7 +3524,7 @@ tf_cc_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "@com_google_absl//absl/memory", diff --git 
a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 85fc42f7475..89e62bd2f0d 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include +#include #include #include #include @@ -107,6 +108,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleAdd(HloInstruction* add) override; + Status HandleAnd(HloInstruction* logical_and) override; + Status HandleBitcast(HloInstruction* bitcast) override; Status HandleBitcastConvert(HloInstruction* bitcast) override; @@ -141,6 +144,12 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleMultiply(HloInstruction* multiply) override; + Status HandleNegate(HloInstruction* negate) override; + + Status HandleNot(HloInstruction* logical_not) override; + + Status HandleOr(HloInstruction* logical_or) override; + Status HandlePad(HloInstruction* pad) override; Status HandlePower(HloInstruction* power) override; @@ -306,9 +315,11 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { // Tries to use a kDot in place of the given convolution. StatusOr SimplifyConvToDot(HloInstruction* convolution); - // Tries to simplify a slice(pad(...)) where the result of the slice is a - // scalar. - StatusOr TrySimplifySliceOfPad(HloInstruction* slice); + // Tries to simplify a slice where the result of the slice is a scalar. + StatusOr TrySimplifyScalarSlice(HloInstruction* slice); + + // Tries to convert slice(reshape(X)) into reshape(slice(X)) + StatusOr TryToReorderSliceAndReshape(HloInstruction* slice); // Current HloComputation instance the AlgebraicSimplifierVisitor is // traversing. @@ -423,6 +434,43 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { return Status::OK(); } +Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { + HloInstruction *lhs, *rhs; + CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs)))); + // Simplify logical and + if (ShapeUtil::HasPrimitiveType(lhs->shape(), xla::PRED) && + ShapeUtil::HasPrimitiveType(rhs->shape(), xla::PRED)) { + // A && True => A + VLOG(10) << "trying transform [A && True => A]: " + << logical_and->ToString(); + if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_and, lhs)) { + return Status::OK(); + } + // True && A => A + VLOG(10) << "trying transform [True && A => A]: " + << logical_and->ToString(); + if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_and, rhs)) { + return Status::OK(); + } + + // A && False => False + VLOG(10) << "trying transform [A && False => False]: " + << logical_and->ToString(); + if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_and, rhs)) { + return Status::OK(); + } + + // False && A => False + VLOG(10) << "trying transform [False && A => False]: " + << logical_and->ToString(); + if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_and, lhs)) { + return Status::OK(); + } + } + + return Status::OK(); +} + Status AlgebraicSimplifierVisitor::HandleBitcast(HloInstruction* bitcast) { // If a bitcast feeds a bitcast, make it a single bitcast. 
HloInstruction* op; @@ -1229,6 +1277,64 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { return Status::OK(); } +Status AlgebraicSimplifierVisitor::HandleNegate(HloInstruction* negate) { + // negate(negate(x)) => x + HloInstruction* x; + if (Match(negate, m::Negate(m::Negate(m::Op(&x)))) && + ReplaceInstructionIfSameShape(negate, x)) { + return Status::OK(); + } + return Status::OK(); +} + +Status AlgebraicSimplifierVisitor::HandleNot(HloInstruction* logical_not) { + // not(not(x)) => x + HloInstruction* x; + if (Match(logical_not, m::Not(m::Not(m::Op(&x)))) && + ReplaceInstructionIfSameShape(logical_not, x)) { + return Status::OK(); + } + return Status::OK(); +} + +Status AlgebraicSimplifierVisitor::HandleOr(HloInstruction* logical_or) { + HloInstruction *lhs, *rhs; + CHECK(Match(logical_or, m::Or(m::Op(&lhs), m::Op(&rhs)))); + + // Simplify logical or + if (ShapeUtil::HasPrimitiveType(lhs->shape(), xla::PRED) && + ShapeUtil::HasPrimitiveType(rhs->shape(), xla::PRED)) { + // A || True => True + VLOG(10) << "trying transform [A || True => True]: " + << logical_or->ToString(); + if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_or, rhs)) { + return Status::OK(); + } + // True || A => True + VLOG(10) << "trying transform [True || A => True]: " + << logical_or->ToString(); + if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_or, lhs)) { + return Status::OK(); + } + + // A || False => A + VLOG(10) << "trying transform [A || False => A]: " + << logical_or->ToString(); + if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_or, lhs)) { + return Status::OK(); + } + + // False || A => A + VLOG(10) << "trying transform [False || A => A]: " + << logical_or->ToString(); + if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_or, rhs)) { + return Status::OK(); + } + } + + return Status::OK(); +} + Status AlgebraicSimplifierVisitor::HandleLog(HloInstruction* log) { // ln(exp(A)) => A VLOG(10) << "trying transform [ln(exp(A)) => A]: " << log->ToString(); @@ -1826,60 +1932,160 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse) { return Status::OK(); } -StatusOr AlgebraicSimplifierVisitor::TrySimplifySliceOfPad( +StatusOr AlgebraicSimplifierVisitor::TrySimplifyScalarSlice( HloInstruction* slice) { // Only try to do this for effective scalars. We could do the same for slicing // out larger pieces of padding (replacing with a broadcast of the padding // value), but this is probably not worth it. - if (!ShapeUtil::IsEffectiveScalar(slice->shape()) || - slice->operand(0)->opcode() != HloOpcode::kPad) { + if (!ShapeUtil::IsEffectiveScalar(slice->shape())) { return false; } - VLOG(10) << "Trying to simplify scalar slice of pad"; - // Check there's no internal padding. Again, we could handle that too, since - // everything is statically known, but it's not worth it. - auto pad = Cast(slice->mutable_operand(0)); - auto padding_config = pad->padding_config(); - int64 rank = padding_config.dimensions_size(); - if (HasInteriorPadding(padding_config)) { - VLOG(10) << "Not folding scalar slice of pad, pad has interior padding"; - return false; + if (slice->operand(0)->opcode() == HloOpcode::kPad) { + VLOG(10) << "Trying to simplify scalar slice of pad"; + // Check there's no internal padding. Again, we could handle that too, since + // everything is statically known, but it's not worth it. 
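The new HandleAnd, HandleOr, HandleNot, and HandleNegate rewrites above rely on ordinary Boolean and arithmetic identities. A small NumPy check of those identities follows; it is illustrative only and unrelated to the HLO implementation:

```python
import numpy as np

a = np.array([True, False, True])
t = np.ones_like(a)   # all-True predicate
f = np.zeros_like(a)  # all-False predicate

assert np.array_equal(a & t, a)   # A && True  => A
assert np.array_equal(a & f, f)   # A && False => False
assert np.array_equal(a | f, a)   # A || False => A
assert np.array_equal(a | t, t)   # A || True  => True
assert np.array_equal(~~a, a)     # not(not(A)) => A

x = np.array([1.5, -2.0])
assert np.array_equal(-(-x), x)   # negate(negate(X)) => X
```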
+ auto pad = Cast(slice->mutable_operand(0)); + auto padding_config = pad->padding_config(); + int64 rank = padding_config.dimensions_size(); + if (HasInteriorPadding(padding_config)) { + VLOG(10) << "Not folding scalar slice of pad, pad has interior padding"; + return false; + } + + // Check whether the scalar we're slicing out falls into the padding. + bool in_padding = [&]() { + for (int64 i = 0; i < rank; ++i) { + int64 start = slice->slice_starts(i); + int64 low = padding_config.dimensions(i).edge_padding_low(); + int64 data = pad->operand(0)->shape().dimensions(i); + if (start >= low && start < low + data) { + return false; + } + } + return true; + }(); + + if (in_padding) { + VLOG(10) << "Folding scalar slice of pad into padding value"; + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + slice, HloInstruction::CreateReshape(slice->shape(), + pad->mutable_padding_value()))); + return true; + } else { + // We already know the output of the slice is scalar. If the padded + // value is scalar, and it's not in the padding, then it's exactly the + // output value. + bool replaced = + ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0)); + if (replaced) { + VLOG(10) << "Folding scalar slice of pad into padded value"; + } else { + VLOG(10) << "Not folding scalar slice of pad into padded value as they " + "have different shapes."; + } + return replaced; + } } - // Check whether the scalar we're slicing out falls into the padding. - bool in_padding = [&]() { - for (int64 i = 0; i < rank; ++i) { - int64 start = slice->slice_starts(i); - int64 low = padding_config.dimensions(i).edge_padding_low(); - int64 data = pad->operand(0)->shape().dimensions(i); - if (start >= low && start < low + data) { - return false; + if (slice->operand(0)->opcode() == HloOpcode::kConcatenate) { + VLOG(10) << "Trying to simplify scalar slice of concat"; + // Only do this for R1, there's no chance of this being useful otherwise. + if (ShapeUtil::Rank(slice->shape()) != 1) { + VLOG(10) << "Not folding, slice is not rank 1"; + return false; + } + HloConcatenateInstruction* concat = + Cast(slice->mutable_operand(0)); + int64 operand_start = 0; + int64 operand_num = 0; + // Weird loop structure to avoid annoying off-by-one errors. 
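// The loop below walks the R1 concatenate's operands, accumulating their
// lengths until the next operand would begin past the requested slice start;
// the sliced element then lives in that operand at offset
// slice_start - operand_start. A standalone sketch with concrete numbers
// (struct and function names are illustrative only; a valid slice_start is
// assumed, mirroring the TF_RET_CHECK in the loop):
#include <cstdint>
#include <iostream>
#include <vector>

struct ConcatOffset {
  int64_t operand_num;    // which concatenate operand holds the element
  int64_t operand_start;  // where that operand begins in the concatenated R1
};

ConcatOffset FindConcatOperand(const std::vector<int64_t>& operand_lengths,
                               int64_t slice_start) {
  int64_t operand_start = 0;
  int64_t operand_num = 0;
  while (true) {
    const int64_t next_operand_start =
        operand_start + operand_lengths[operand_num];
    if (next_operand_start > slice_start) {
      break;  // slice_start falls inside the current operand
    }
    operand_start = next_operand_start;
    ++operand_num;
  }
  return {operand_num, operand_start};
}

int main() {
  // For concat(a[3], b[4], c[5]) and slice start 8 the element is c[8 - 7],
  // i.e. operand 2 at offset 1.
  const ConcatOffset off = FindConcatOperand({3, 4, 5}, 8);
  std::cout << off.operand_num << " " << (8 - off.operand_start) << "\n";
}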
+ while (true) { + TF_RET_CHECK(operand_num < concat->operand_count()); + const HloInstruction* operand = concat->operand(operand_num); + int64 next_operand_start = operand_start + operand->shape().dimensions(0); + if (next_operand_start > slice->slice_starts(0)) { + break; + } + operand_start = next_operand_start; + operand_num++; + } + + bool replaced = ReplaceInstructionIfSameShape( + slice, concat->mutable_operand(operand_num)); + if (replaced) { + VLOG(10) << "Folding scalar slice of concat into concat operand"; + } else { + VLOG(10) << "Folding scalar slice of concat into slice of concat operand"; + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + slice, HloInstruction::CreateSlice( + slice->shape(), concat->mutable_operand(operand_num), + {slice->slice_starts(0) - operand_start}, + {slice->slice_starts(0) - operand_start + 1}, + slice->slice_strides()))); + } + return true; + } + + return false; +} + +bool IsUnstridedSlice(const HloInstruction* hlo) { + return absl::c_all_of(hlo->slice_strides(), + [](int64 stride) { return stride == 1; }); +} + +StatusOr AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape( + HloInstruction* slice) { + CHECK_EQ(slice->opcode(), HloOpcode::kSlice); + if (!IsUnstridedSlice(slice)) { + return false; + } + HloInstruction* reshape = slice->mutable_operand(0); + if (reshape->opcode() != HloOpcode::kReshape) { + return false; + } + HloInstruction* new_slice_operand = reshape->mutable_operand(0); + int64 slice_rank = ShapeUtil::Rank(slice->shape()); + std::vector sliced_dims; + for (int64 i = 0; i < slice_rank; ++i) { + if (slice->slice_starts(i) != 0 || + slice->slice_limits(i) != reshape->shape().dimensions(i)) { + sliced_dims.push_back(i); + } + } + + if (sliced_dims.size() == 1 && sliced_dims[0] == 0 && + slice->slice_starts(0) == 0) { + const Shape& new_slice_shape = new_slice_operand->shape(); + const int64 rank = ShapeUtil::Rank(new_slice_shape); + std::vector new_slice_starts(rank, 0); + std::vector new_slice_stides(rank, 1); + std::vector new_slice_limits(new_slice_shape.dimensions().begin(), + new_slice_shape.dimensions().end()); + int64 slice_elements = ShapeUtil::ElementsIn(slice->shape()); + for (int64 i = rank - 1; i >= 0; --i) { + if (slice_elements >= new_slice_limits[i]) { + if (slice_elements % new_slice_limits[i] != 0) { + return false; + } + slice_elements /= new_slice_limits[i]; + } else { + new_slice_limits[i] = slice_elements; + slice_elements = 1; } } - return true; - }(); - - if (in_padding) { - VLOG(10) << "Folding scalar slice of pad into padding value"; + HloInstruction* new_slice = + computation_->AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(new_slice_shape.element_type(), + new_slice_limits), + new_slice_operand, new_slice_starts, new_slice_limits, + new_slice_stides)); TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( - slice, HloInstruction::CreateReshape(slice->shape(), - pad->mutable_padding_value()))); + slice, HloInstruction::CreateReshape(slice->shape(), new_slice))); return true; - } else { - // We already know the output of the slice is scalar. If the padded - // value is scalar, and it's not in the padding, then it's exactly the - // output value. 
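// TryToReorderSliceAndReshape above only fires when the slice keeps a leading
// prefix of dimension 0 of the reshape output, i.e. a contiguous prefix of
// the flattened data. The limit computation then re-expresses that prefix as
// a slice of the reshape's operand by consuming the operand's dimensions from
// the last one backwards. A standalone sketch of that computation (function
// and variable names are illustrative only):
#include <cstdint>
#include <vector>

bool ComputeOperandSliceLimits(const std::vector<int64_t>& operand_dims,
                               int64_t slice_elements,
                               std::vector<int64_t>* new_limits) {
  *new_limits = operand_dims;
  for (int64_t i = static_cast<int64_t>(operand_dims.size()) - 1; i >= 0;
       --i) {
    if (slice_elements >= (*new_limits)[i]) {
      if (slice_elements % (*new_limits)[i] != 0) {
        return false;  // the prefix cuts through the middle of this dimension
      }
      slice_elements /= (*new_limits)[i];
    } else {
      (*new_limits)[i] = slice_elements;  // partial extent in this dimension
      slice_elements = 1;                 // leading dimensions collapse to 1
    }
  }
  return true;
}

// Example: X has shape {2, 3, 4} and is reshaped to {6, 4}; slicing rows
// [0, 3) of the reshape keeps 12 elements, and
// ComputeOperandSliceLimits({2, 3, 4}, 12, &limits) yields {1, 3, 4}, so the
// slice can be taken from X first and the smaller result reshaped to {3, 4}.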
- bool replaced = - ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0)); - if (replaced) { - VLOG(10) << "Folding scalar slice of pad into padded value"; - } else { - VLOG(10) << "Not folding scalar slice of pad into padded value as they " - "have different shapes."; - } - return replaced; } + return false; } Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { @@ -1888,12 +2094,8 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { return Status::OK(); } - auto is_unstrided_slice = [](const HloInstruction* hlo) { - return absl::c_all_of(hlo->slice_strides(), - [](int64 stride) { return stride == 1; }); - }; if (slice->operand(0)->opcode() == HloOpcode::kSlice && - is_unstrided_slice(slice) && is_unstrided_slice(slice->operand(0))) { + IsUnstridedSlice(slice) && IsUnstridedSlice(slice->operand(0))) { HloInstruction* operand_slice = slice->mutable_operand(0); std::vector new_slice_starts = slice->slice_starts(); std::vector new_slice_limits = slice->slice_limits(); @@ -1907,11 +2109,15 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { new_slice_starts, new_slice_limits, slice->slice_strides())); } - TF_ASSIGN_OR_RETURN(bool replaced, TrySimplifySliceOfPad(slice)); + TF_ASSIGN_OR_RETURN(bool replaced, TrySimplifyScalarSlice(slice)); if (replaced) { return Status::OK(); } + TF_ASSIGN_OR_RETURN(replaced, TryToReorderSliceAndReshape(slice)); + if (replaced) { + return Status::OK(); + } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 7b3e957fbcf..e4c4da1b0e7 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -33,7 +33,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -54,10 +53,11 @@ AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() { return [](const Shape&, const Shape&) { return false; }; } -class AlgebraicSimplifierTest : public HloVerifiedTestBase {}; +class AlgebraicSimplifierTest : public HloTestBase {}; // Test that A + 0 is simplified to A TEST_F(AlgebraicSimplifierTest, AddZero) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -67,18 +67,19 @@ TEST_F(AlgebraicSimplifierTest, AddZero) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, zero)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } // Test that A * 0 is simplified to 0 TEST_F(AlgebraicSimplifierTest, MulZero) { + auto m = CreateNewVerifiedModule(); Shape r0s32 = ShapeUtil::MakeShape(S32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -88,12 +89,12 @@ TEST_F(AlgebraicSimplifierTest, MulZero) { builder.AddInstruction( HloInstruction::CreateBinary(r0s32, HloOpcode::kMultiply, param0, zero)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kMultiply); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), zero); } @@ -166,6 +167,7 @@ TEST_F(AlgebraicSimplifierTest, SelectIdentical) { // Test that Reduce(Reduce(A)) -> Reduce(A) TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); // Create add computation. 
HloInstruction* zero = builder.AddInstruction( @@ -180,7 +182,7 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) { HloInstruction::CreateParameter(1, scalar_shape, "p1")); builder.AddInstruction( HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); - add_computation = module().AddEmbeddedComputation(builder.Build()); + add_computation = m->AddEmbeddedComputation(builder.Build()); } Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7}); HloInstruction* param = builder.AddInstruction( @@ -193,17 +195,18 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) { Shape r1f32 = ShapeUtil::MakeShape(F32, {5}); builder.AddInstruction(HloInstruction::CreateReduce(r1f32, reduce0, zero, dims1, add_computation)); - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); - HloInstruction* root = module().entry_computation()->root_instruction(); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reduce(param, zero)); EXPECT_EQ(root->dimensions(), std::vector({0, 2, 3})); } // Test that Const + A is canonicalized to A + Const. TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -213,18 +216,19 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, constant, param0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param0, op::Constant())); } // Test that [(A + C1) + C2] => [A + (C1 + C2)] for constants C1 and C2. 
TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -239,17 +243,18 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, add1, constant2)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2))); } TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) { + auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -261,17 +266,18 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) { builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); // Create add computation. 
HloComputation* add_computation = nullptr; @@ -284,7 +290,7 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) { HloInstruction::CreateParameter(1, scalar_shape, "p1")); builder.AddInstruction( HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); - add_computation = module().AddEmbeddedComputation(builder.Build()); + add_computation = m->AddEmbeddedComputation(builder.Build()); } Shape r2f32 = ShapeUtil::MakeShape(F32, {32, 1}); HloInstruction* param0 = builder.AddInstruction( @@ -297,17 +303,18 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) { HloInstruction::CreateBroadcast(r2f32, zero, {}))}, add_computation)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kMap); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param0, op::Broadcast(zero))); } TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { + auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -319,64 +326,68 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({3.14f, 3.14f, 3.14f}))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(op::Constant())); EXPECT_EQ(3.14f, root->operand(0)->literal().GetFirstElement()); } TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({3.14, 3.14, 4}))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie()); + 
ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); } TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({0.0f, 1.0f, 2.0f}))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Iota()); } // Test that A - 0 is simplified to A TEST_F(AlgebraicSimplifierTest, SubZero) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -386,18 +397,19 @@ TEST_F(AlgebraicSimplifierTest, SubZero) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kSubtract, param0, zero)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSubtract); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } // Test that A - Const is canonicalized to A + (-Const). TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -407,18 +419,19 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) { builder.AddInstruction(HloInstruction::CreateBinary( r0f32, HloOpcode::kSubtract, param0, constant)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSubtract); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param0, op::Negate(constant))); } // Test that (A/B)/C is simplified to A/(B*C). 
TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -432,14 +445,14 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, div, param2)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Divide(op::Divide(param0, param1), param2)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Multiply(param1, param2))); @@ -447,6 +460,7 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) { // Test that A/(B/C) is simplified to (A*C)/B. TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -460,14 +474,14 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, div)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Divide(param1, param2))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Divide(op::Multiply(param0, param2), param1)); @@ -475,6 +489,7 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) { // Test that (A/B)/(C/D) is simplified to (A*D)/(B*C). TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) { + auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {42, 123}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -492,7 +507,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) { builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, div0, div1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT( computation->root_instruction(), @@ -500,7 +515,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT( computation->root_instruction(), @@ -509,6 +524,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) { // Test that A/exp(B) is simplified to A*exp(-B). 
TEST_F(AlgebraicSimplifierTest, DivOfExp) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -520,14 +536,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, exp)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Exp(param1))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, op::Exp(op::Negate(param1)))); @@ -535,6 +551,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) { // Test that A/pow(B,C) is simplified to A*pow(B,-C). TEST_F(AlgebraicSimplifierTest, DivOfPower) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -548,14 +565,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, power)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Power(param1, param2))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, op::Power(param1, op::Negate(param2)))); @@ -564,6 +581,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) { // Test that broadcasting is done on the right step when simplifying A/pow(B,C) // to A*pow(B,-C). 
TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) { + auto m = CreateNewVerifiedModule(); Shape r1f32 = ShapeUtil::MakeShape(F32, {7}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -577,14 +595,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) { builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide, param0, power)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Power(param1, param2))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); ASSERT_THAT(computation->root_instruction(), op::Multiply(param0, op::Power(param1, op::Negate(param2)))); @@ -592,6 +610,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) { // A / Const => A * InvertedConst TEST_F(AlgebraicSimplifierTest, DivideByConstant) { + auto m = CreateNewVerifiedModule(); Shape r1f32 = ShapeUtil::MakeShape(F32, {3}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -602,11 +621,11 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) { builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide, param0, constant)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, op::Constant())); @@ -614,6 +633,7 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) { // pow(pow(A, X), Y) => pow(A, X*Y) TEST_F(AlgebraicSimplifierTest, PowerOfPower) { + auto m = CreateNewVerifiedModule(); Shape r1f32 = ShapeUtil::MakeShape(F32, {7}); HloComputation::Builder builder(TestName()); HloInstruction* base = builder.AddInstruction( @@ -627,10 +647,10 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) { builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, inner_power, exp2)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Power(base, op::Multiply(exp1, exp2))); } @@ -638,6 +658,7 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) { // Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex // numbers. 
TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) { + auto m = CreateNewVerifiedModule(); Shape r1c64 = ShapeUtil::MakeShape(C64, {7}); HloComputation::Builder builder(TestName()); HloInstruction* base = builder.AddInstruction( @@ -651,14 +672,15 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) { builder.AddInstruction(HloInstruction::CreateBinary(r1c64, HloOpcode::kPower, inner_power, exp2)); - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie()); } // Test that A/1 is simplified to A for a scalar. TEST_F(AlgebraicSimplifierTest, DivOneScalar) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -668,18 +690,19 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) { HloInstruction* div = builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, one)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, div); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } // Test that A/1 is simplified to A for an array. TEST_F(AlgebraicSimplifierTest, DivOneArray) { + auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -689,18 +712,19 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) { HloInstruction* div = builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, one)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, div); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } // Test that complex(real(c), imag(c)) is simplified to c. 
TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) { + auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2}); Shape r2c64 = ShapeUtil::MakeShape(C64, {2, 2}); HloComputation::Builder builder(TestName()); @@ -713,18 +737,19 @@ TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) { HloInstruction* cplx = builder.AddInstruction( HloInstruction::CreateBinary(r2c64, HloOpcode::kComplex, real, imag)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, cplx); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } // Test that real(complex(r,i)) is simplified to r. TEST_F(AlgebraicSimplifierTest, RealOfComplex) { + auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -737,18 +762,19 @@ TEST_F(AlgebraicSimplifierTest, RealOfComplex) { HloInstruction* real = builder.AddInstruction( HloInstruction::CreateUnary(r2f32, HloOpcode::kReal, cplx)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, real); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } // Test that imag(complex(r,i)) is simplified to i. 
TEST_F(AlgebraicSimplifierTest, ImagOfComplex) { + auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -761,18 +787,19 @@ TEST_F(AlgebraicSimplifierTest, ImagOfComplex) { HloInstruction* imag = builder.AddInstruction( HloInstruction::CreateUnary(r2f32, HloOpcode::kImag, cplx)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, imag); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param1); } // Test that get_element(make_tuple({A,B}),1) is simplified to B TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -788,18 +815,19 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) { HloInstruction* add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, get, param2)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, add); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param1, param2)); } // Test that exp(A)/exp(B) is simplified to exp(A-B) TEST_F(AlgebraicSimplifierTest, ExpDiv) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -813,14 +841,14 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, exp0, exp1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Divide(op::Exp(param0), op::Exp(param1))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Exp(op::Subtract(param0, param1))); @@ -828,6 +856,7 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) { // Test that exp(A)*exp(B) is simplified to exp(A+B) TEST_F(AlgebraicSimplifierTest, ExpMul) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -841,14 +870,14 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, exp0, exp1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Multiply(op::Exp(param0), op::Exp(param1))); AlgebraicSimplifier 
simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Exp(op::Add(param0, param1))); @@ -856,6 +885,7 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) { // Test that pow(exp(A), B) is simplified to exp(A*B) TEST_F(AlgebraicSimplifierTest, PowExp) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -867,14 +897,14 @@ TEST_F(AlgebraicSimplifierTest, PowExp) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, exp0, param1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Power(op::Exp(param0), param1)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Exp(op::Multiply(param0, param1))); @@ -882,6 +912,7 @@ TEST_F(AlgebraicSimplifierTest, PowExp) { // Test that ln(pow(A, B)) is simplified to ln(A)*B TEST_F(AlgebraicSimplifierTest, LnPow) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -893,14 +924,14 @@ TEST_F(AlgebraicSimplifierTest, LnPow) { builder.AddInstruction( HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, pow)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Log(op::Power(param0, param1))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Multiply(op::Log(param0), param1)); @@ -908,6 +939,7 @@ TEST_F(AlgebraicSimplifierTest, LnPow) { // Test that ln(exp(A)) is simplified to A TEST_F(AlgebraicSimplifierTest, LnExp) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -917,19 +949,20 @@ TEST_F(AlgebraicSimplifierTest, LnExp) { builder.AddInstruction( HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, exp0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param0); } // Test that ln(exp(A)/exp(B)) is simplified to A-B TEST_F(AlgebraicSimplifierTest, LnExpDiv) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -945,14 +978,14 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) { builder.AddInstruction( HloInstruction::CreateUnary(r0f32, 
HloOpcode::kLog, div)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Log(op::Divide(op::Exp(param0), op::Exp(param1)))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1)); } @@ -960,6 +993,7 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) { // Test that pow(A, 0) where A is a scalar is simplified to the scalar // constant 1. TEST_F(AlgebraicSimplifierTest, Pow0Scalar) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -969,13 +1003,13 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, zero)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); @@ -984,6 +1018,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) { // Test that pow(A, 0) where A is not a scalar is simplified to broadcast(1). TEST_F(AlgebraicSimplifierTest, Pow0Vector) { + auto m = CreateNewVerifiedModule(); Shape r1f32 = ShapeUtil::MakeShape(F32, {42}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -993,13 +1028,13 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) { builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param0, zero)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast()); @@ -1012,6 +1047,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) { // Test that pow(A, 1) is simplified to A. 
TEST_F(AlgebraicSimplifierTest, Pow1) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -1021,19 +1057,20 @@ TEST_F(AlgebraicSimplifierTest, Pow1) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, one)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Power(param0, one)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param0); } // Test that pow(A, 2) is simplified to A*A. TEST_F(AlgebraicSimplifierTest, Pow2) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -1043,19 +1080,20 @@ TEST_F(AlgebraicSimplifierTest, Pow2) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, two)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Power(param0, two)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0)); } // Test that pow(A, -1) is simplified to 1/A. TEST_F(AlgebraicSimplifierTest, PowNegative1) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -1065,13 +1103,13 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) { builder.AddInstruction(HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, negative_one)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Divide(op::Broadcast(), param0)); @@ -1081,6 +1119,7 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) { } TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); HloInstruction* lhs = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {3, 3, 0}), "lhs")); @@ -1113,17 +1152,18 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) { builder.AddInstruction(HloInstruction::CreateConvolve( ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); HloPassFix simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - EXPECT_THAT(module().entry_computation()->root_instruction(), + 
EXPECT_THAT(m->entry_computation()->root_instruction(), op::Convolution(lhs, rhs)); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); - EXPECT_THAT(module().entry_computation()->root_instruction(), + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Broadcast(op::Constant())); } TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); HloInstruction* param = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1148,24 +1188,25 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) { HloInstruction::CreateParameter(1, scalar_shape, "p1")); builder.AddInstruction( HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); - add_computation = module().AddEmbeddedComputation(builder.Build()); + add_computation = m->AddEmbeddedComputation(builder.Build()); } builder.AddInstruction(HloInstruction::CreateReduceWindow( ShapeUtil::MakeShape(F32, {5, 2}), param, builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), window, add_computation)); - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); HloPassFix simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - EXPECT_THAT(module().entry_computation()->root_instruction(), + EXPECT_THAT(m->entry_computation()->root_instruction(), op::ReduceWindow(param, op::Constant())); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); - EXPECT_THAT(module().entry_computation()->root_instruction(), + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Broadcast(op::Constant())); } TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); HloInstruction* param = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1182,17 +1223,18 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) { builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), padding)); - module().AddEntryComputation(builder.Build()); - EXPECT_THAT(module().entry_computation()->root_instruction(), + m->AddEntryComputation(builder.Build()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Pad(param, op::Constant())); HloPassFix simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); - EXPECT_THAT(module().entry_computation()->root_instruction(), + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Broadcast(op::Constant())); } TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); auto builder = HloComputation::Builder(TestName()); @@ -1206,39 +1248,41 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) { ShapeUtil::MakeShape(F32, {3, 2}), broadcast)); auto computation = builder.Build(); - module().AddEntryComputation(std::move(computation)); + m->AddEntryComputation(std::move(computation)); - EXPECT_THAT(module().entry_computation()->root_instruction(), + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Reshape(op::Broadcast(op::Reshape(op)))); HloPassFix simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - 
EXPECT_THAT(module().entry_computation()->root_instruction(), op); + EXPECT_THAT(m->entry_computation()->root_instruction(), op); } // Test that convert(A, $TYPE) is simplified to A if A is of type $TYPE. TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* input = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); builder.AddInstruction( HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Convert(input)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), input); } // Test that copies are removed. TEST_F(AlgebraicSimplifierTest, RemoveCopy) { + auto m = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -1246,18 +1290,19 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) { builder.AddInstruction( HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param0); } TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1268,24 +1313,25 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) { ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), HloOpcode::kCopy, param)); *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 2, 0, 3}); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Copy(param)); AlgebraicSimplifier simplifier1(/*is_layout_sensitive=*/true, non_bitcasting_callback()); - ASSERT_FALSE(simplifier1.Run(&module()).ValueOrDie()); + ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie()); // Verify that the copy is not replaced. EXPECT_THAT(computation->root_instruction(), op::Copy(param)); AlgebraicSimplifier simplifier2(/*is_layout_sensitive=*/true, bitcasting_callback()); - ASSERT_TRUE(simplifier2.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie()); // Verify that the copy is replaced. EXPECT_THAT(computation->root_instruction(), op::Bitcast(param)); } // Test that unary concatenates are removed. 
TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) { + auto m = CreateNewVerifiedModule(); Shape r1f32 = ShapeUtil::MakeShape(F32, {100}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -1293,19 +1339,20 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) { builder.AddInstruction( HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param0); } // Test that empty operands of concatenates are removed. TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { + auto m = CreateNewVerifiedModule(); const int kParamLength = 100; Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength}); HloComputation::Builder builder(TestName()); @@ -1322,7 +1369,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { builder.AddInstruction(HloInstruction::CreateConcatenate( result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT( computation->root_instruction(), @@ -1330,7 +1377,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0, param0, param1)); @@ -1338,6 +1385,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { // Test that reduce of concat is simplified. TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) { + auto m = CreateNewVerifiedModule(); const int kParamLength = 100; Shape r3f32 = ShapeUtil::MakeShape(F32, {kParamLength, kParamLength, kParamLength}); @@ -1363,7 +1411,7 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) { HloInstruction::CreateParameter(1, scalar_shape, "p1")); builder.AddInstruction( HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); - add_computation = module().AddEmbeddedComputation(builder.Build()); + add_computation = m->AddEmbeddedComputation(builder.Build()); } Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7}); Shape reduce_shape = ShapeUtil::MakeShape(F32, {kParamLength}); @@ -1373,11 +1421,11 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) { builder.AddInstruction(HloInstruction::CreateReduce( reduce_shape, Concatenate, zero, {1, 2}, add_computation)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT( computation->root_instruction(), @@ -1387,6 +1435,7 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) { // Test a concatenate with only empty operands is removed. 
TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) { + auto m = CreateNewVerifiedModule(); const int kParamLength = 100; Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength}); HloComputation::Builder builder(TestName()); @@ -1401,20 +1450,21 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) { builder.AddInstruction(HloInstruction::CreateConcatenate( result_shape, {empty_literal, empty_slice}, 0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Concatenate(empty_literal, empty_slice)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), empty_literal); } // Test that concat with a scalar broadcast becomes a pad. TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) { + auto m = CreateNewVerifiedModule(); Shape r1f32 = ShapeUtil::MakeShape(F32, {100}); Shape r0f32 = ShapeUtil::MakeShape(F32, {}); HloComputation::Builder builder(TestName()); @@ -1427,17 +1477,18 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) { builder.AddInstruction(HloInstruction::CreateConcatenate( ShapeUtil::MakeShape(F32, {200}), {broadcast, param0}, 0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1)); } // Test that a simplification which changes layouts is not performed if layout // sensitive is true. TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1445,7 +1496,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) { HloInstruction* copy = builder.AddInstruction( HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); // Set to different layouts. *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1}); @@ -1455,7 +1506,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie()); + EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); // Copy has not been removed. EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); @@ -1464,6 +1515,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) { // Test that a simplification which preserves layouts is performed if layout // sensitive is true. 
TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1471,7 +1523,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) { HloInstruction* copy = builder.AddInstruction( HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); // Set to same layouts. *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1}); @@ -1481,7 +1533,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Copy has been removed. EXPECT_THAT(computation->root_instruction(), param0); @@ -1490,6 +1542,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) { // Test that a reshape which could be replaced with a bitcast is not if // add_bitcasts is false. TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1502,13 +1555,13 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) { *reshape->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2, 3, 4, 5}); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie()); + EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); // Reshape is not replaced with a bitcast. EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); @@ -1516,6 +1569,7 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) { // Test transforming reshapes and transposes of rng. TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* zero = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); @@ -1532,11 +1586,11 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) { ShapeUtil::MakeShape(F32, {4}), transpose)) ->shape(); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, bitcasting_callback()); - EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie()); + EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Verify that that reshape(transpose(rng)) is replace by a single rng of the // same shape as the reshape. @@ -1547,6 +1601,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) { // Test transforming reshapes to bitcasts under various conditions. 
TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1578,7 +1633,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { builder.AddInstruction(HloInstruction::CreateTuple( {transformable_reshape, dimensions_wrong_reshape, layout_wrong_reshape})); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Tuple(transformable_reshape, dimensions_wrong_reshape, @@ -1586,7 +1641,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, bitcasting_callback()); - simplifier.Run(&module()).ValueOrDie(); + simplifier.Run(m.get()).ValueOrDie(); // Verify that only the first reshape is replaced. EXPECT_THAT( @@ -1597,6 +1652,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { // Regression test for a bug where if we failed to sink a reshape, we'd set the // 'changed' bit in AlgebraicSimplifier to false. TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); // This add (param0 + 0) can be simplified. @@ -1613,13 +1669,14 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, bitcasting_callback()); - module().AddEntryComputation(builder.Build()); - EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie()); + m->AddEntryComputation(builder.Build()); + EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } // Regression test for a bug where if we failed to sink a reshape, we'd set the // 'changed' bit in AlgebraicSimplifier to false. TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); // This add (param0 + 0) can be simplified. @@ -1637,11 +1694,12 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, bitcasting_callback()); - module().AddEntryComputation(builder.Build()); - EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie()); + m->AddEntryComputation(builder.Build()); + EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1655,19 +1713,20 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) { *transpose->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2, 3}); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Transpose(param)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Verify that the reshape is replaced. 
EXPECT_THAT(computation->root_instruction(), op::Bitcast(param)); } TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1681,19 +1740,20 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) { *transpose->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({3, 1, 2, 0}); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Transpose(param)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Verify that the reshape is replaced. EXPECT_THAT(computation->root_instruction(), op::Bitcast(param)); } TEST_F(AlgebraicSimplifierTest, ReshapesMerged) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1706,19 +1766,20 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) { builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(F32, {1, 2, 1, 1, 2, 1}), reshape1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Reshape(param0))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); } TEST_F(AlgebraicSimplifierTest, CopiesMerged) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1733,18 +1794,19 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) { ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 2, 1}), HloOpcode::kCopy, copy1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); } TEST_F(AlgebraicSimplifierTest, TransposesMerged) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1757,13 +1819,13 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) { builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {4, 3, 2}), transpose1, {1, 0, 2})); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Transpose(param0)); 
EXPECT_EQ(std::vector({2, 1, 0}), @@ -1772,6 +1834,7 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) { // Test merging reshape and broadcast. TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {5}), "param0")); @@ -1780,20 +1843,21 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) { builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {1, 2, 3, 5, 1}), reshape1, {0, 3, 2})); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(op::Reshape(param0))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0)); } // Test merging broadcast and reshape. TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {2, 3}), "param0")); @@ -1802,19 +1866,20 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) { builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2}), broadcast1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param0))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0)); } TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto param = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {1}), "param")); @@ -1823,20 +1888,21 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) { builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), broadcast)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie()); + EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); } TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto param = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {4}), "param")); @@ -1845,14 +1911,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) { builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), broadcast)); - HloComputation* computation = 
module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(param)); EXPECT_THAT(computation->root_instruction()->dimensions(), @@ -1860,6 +1926,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) { } TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto param = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {1}), "param")); @@ -1868,14 +1935,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) { builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(F32, {6, 1, 1, 1}), broadcast)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(param)); const std::vector broadcast_dims = @@ -1885,6 +1952,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) { } TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto param = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {4}), "param")); @@ -1893,33 +1961,34 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) { builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(F32, {6, 8}), broadcast)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie()); + EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); } TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto iota = builder.AddInstruction(HloInstruction::CreateIota( ShapeUtil::MakeShape(F32, {1, 2, 3, 7, 12, 1}), 2)); Shape result_shape = ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2}); builder.AddInstruction(HloInstruction::CreateReshape(result_shape, iota)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), 
op::Iota()); EXPECT_TRUE( @@ -1927,18 +1996,19 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) { } TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto iota = builder.AddInstruction( HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {1, 1}), 0)); auto result_shape = iota->shape(); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Iota()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); auto root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(op::Constant())); @@ -1948,37 +2018,39 @@ TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) { } TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2_6) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto iota = builder.AddInstruction( HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2}), 1)); builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {6}), iota)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie()); + EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); } TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto iota = builder.AddInstruction( HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 4}), 2)); builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), iota)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Iota()); EXPECT_EQ(Cast(computation->root_instruction()) @@ -1987,19 +2059,20 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) { } TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto iota = builder.AddInstruction( HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 2}), 2)); builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(F32, {6, 1, 1, 2}), iota)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); 
EXPECT_THAT(computation->root_instruction(), op::Iota()); const int64 iota_dim = @@ -2009,19 +2082,20 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) { } TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4x2_6x8) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto iota = builder.AddInstruction( HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 4, 2}), 2)); builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {6, 8}), iota)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie()); + EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); } @@ -2043,14 +2117,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) { builder.AddInstruction(HloInstruction::CreatePad( ShapeUtil::MakeShape(F32, {2, 2}), param, zero, no_padding)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } @@ -2076,7 +2150,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad( ShapeUtil::MakeShape(F32, {11, 5}), param, zero, padding)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, @@ -2095,7 +2169,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero)); EXPECT_TRUE(has_negative_padding(pad)); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero))); EXPECT_FALSE( @@ -2110,14 +2184,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) { builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {2, 3}), param)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(param)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } @@ -2133,14 +2207,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) { ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0}, /*limit_indices=*/{dim0, dim1}, /*strides=*/{1, 1})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), 
op::Slice(param)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } @@ -2162,14 +2236,14 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) { ShapeUtil::MakeShape(F32, {dim0 - 5, dim1 - 9}), original_slice, /*start_indices=*/{2, 3}, /*limit_indices=*/{dim0 - 3, dim1 - 6}, /*strides=*/{1, 1})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param))); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Slice(param)); EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3); @@ -2178,6 +2252,57 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) { EXPECT_EQ(computation->root_instruction()->slice_limits(1), dim1 - 4); } +TEST_F(AlgebraicSimplifierTest, SliceOfReshapeToReshapeOfSlice) { + HloComputation::Builder builder(TestName()); + const int64 dim0 = 11; + const int64 dim1 = 12; + const int64 dim2 = 13; + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {dim0 * dim1, dim2}), "param")); + HloInstruction* original_reshape = + builder.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {dim0, dim1, dim2}), param)); + + builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {dim0 - 2, dim1, dim2}), original_reshape, + /*start_indices=*/{0, 0, 0}, + /*limit_indices=*/{dim0 - 2, dim1, dim2}, /*strides=*/{1, 1, 1})); + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param))); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Slice(param))); +} + +TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) { + HloComputation::Builder builder(TestName()); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 144, 25, 1, 512}), "param")); + HloInstruction* original_reshape = + builder.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {3600, 512}), param)); + + builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {960, 512}), original_reshape, + /*start_indices=*/{0, 0}, + /*limit_indices=*/{960, 512}, /*strides=*/{1, 1})); + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param))); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); +} + TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) { auto builder = HloComputation::Builder(TestName()); @@ -2185,11 +2310,11 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) { auto keys = builder.AddInstruction( 
HloInstruction::CreateParameter(0, keys_shape, "keys")); builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), keys); } @@ -2207,15 +2332,191 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) { builder.AddInstruction(HloInstruction::CreateSort( ShapeUtil::MakeTupleShape({keys_shape, values_shape, values_shape}), 0, keys, {values0, values1})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Tuple(keys, values0, values1)); } +// Test that A && True is simplified to A +TEST_F(AlgebraicSimplifierTest, AndTrue) { + auto m = CreateNewVerifiedModule(); + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_true = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd, + param0, const_true)); + + auto computation = m->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAnd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + +// Test that True && A is simplified to A +TEST_F(AlgebraicSimplifierTest, AndTrue2) { + auto m = CreateNewVerifiedModule(); + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_true = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd, + const_true, param0)); + + auto computation = m->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAnd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + +// Test that A && False is simplified to False +TEST_F(AlgebraicSimplifierTest, AndFalse) { + auto m = CreateNewVerifiedModule(); + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_false = builder.AddInstruction( + 
HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd, + param0, const_false)); + + auto computation = m->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAnd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, const_false); +} + +// Test that False && A is simplified to False +TEST_F(AlgebraicSimplifierTest, AndFalse2) { + auto m = CreateNewVerifiedModule(); + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_false = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd, + const_false, param0)); + + auto computation = m->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAnd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, const_false); +} + +// Test that A || True is simplified to True +TEST_F(AlgebraicSimplifierTest, OrTrue) { + auto m = CreateNewVerifiedModule(); + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_true = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); + builder.AddInstruction( + HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, param0, const_true)); + + auto computation = m->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kOr); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, const_true); +} + +// Test that True || A is simplified to True +TEST_F(AlgebraicSimplifierTest, OrTrue2) { + auto m = CreateNewVerifiedModule(); + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_true = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); + builder.AddInstruction( + HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, const_true, param0)); + + auto computation = m->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kOr); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, const_true); +} + +// Test that A || False is simplified to A +TEST_F(AlgebraicSimplifierTest, OrFalse) { + auto m = CreateNewVerifiedModule(); + Shape r0pred = 
ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_false = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, + param0, const_false)); + + auto computation = m->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kOr); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + +// Test that False || A is simplified to A +TEST_F(AlgebraicSimplifierTest, OrFalse2) { + auto m = CreateNewVerifiedModule(); + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_false = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, + const_false, param0)); + + auto computation = m->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kOr); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + // Used for TEST_Ps that test merging (or not) of a kPad instruction into a // convolution's Window. 
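The parameterized convolution-padding tests that follow check when an explicit kPad feeding a convolution can be folded into the convolution's window. A rough sketch (illustrative HLO, not from the patch):

  input  = f32[1,8,8,3] parameter(0)
  filter = f32[3,3,3,16] parameter(1)
  zero   = f32[] constant(0)
  pad.0  = f32[1,12,12,3] pad(input, zero), padding=0_0x2_2x2_2x0_0
  conv   = f32[1,10,10,16] convolution(pad.0, filter), window={size=3x3}, dim_labels=b01f_01io->b01f

When the pad value is zero and the padding is non-negative with no interior padding, the pad can be absorbed into the window, giving convolution(input, filter) with window={size=3x3 pad=2_2x2_2}; the testcases where expected_conv_window is empty cover configurations where this fold must not happen.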
struct ConvPaddingTestcase { @@ -2337,15 +2638,15 @@ TEST_P(ConvInputPaddingTest, DoTest) { .ValueOrDie(), lhs_pad, filter, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); if (testcase.expected_conv_window.empty()) { - ASSERT_FALSE(simplifier.Run(module).ValueOrDie()); + ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } else { - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto* conv = module->entry_computation()->root_instruction(); SCOPED_TRACE(module->ToString()); ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter())); @@ -2455,15 +2756,15 @@ TEST_P(ConvFilterPaddingTest, DoIt) { input, rhs_pad, /*feature_group_count=*/1, window, dnums, precision_config)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); if (testcase.expected_conv_window.empty()) { - ASSERT_FALSE(simplifier.Run(module).ValueOrDie()); + ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } else { - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto* conv = module->entry_computation()->root_instruction(); SCOPED_TRACE(module->ToString()); ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter())); @@ -2604,7 +2905,7 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); // TODO(b/80488902): verify this module. - auto module = HloTestBase::CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(b.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, @@ -2724,7 +3025,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice( slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); @@ -2734,10 +3035,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. 
- ASSERT_FALSE(simplifier.Run(module).ValueOrDie()); + ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(scalar_param)); @@ -2763,7 +3064,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { HloInstruction* reshape = builder.AddInstruction( HloInstruction::CreateReshape(reshape_shape, transpose)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); @@ -2772,7 +3073,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(forty_two)); @@ -2782,7 +3083,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x). TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { // TODO(b/80488902): verify this module. - auto module = HloTestBase::CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); // Create operand to the pad. @@ -2864,7 +3165,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { // ReduceWindow(Convert(op), x). TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { // TODO(b/80488902): verify this module. - auto module = HloTestBase::CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); // Create operand to the pad. @@ -2954,12 +3255,12 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) { builder.AddInstruction( HloInstruction::CreateReverse(shape, a, /*dimensions=*/{2, 3})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(a, root); @@ -2970,6 +3271,7 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) { // Dots add computations to the parent module. Test that, when the HloModule's // computations are updated, then iterator invalidation doesn't occur // when running on subsequent computations. + auto m = CreateNewVerifiedModule(); Shape r1f32 = ShapeUtil::MakeShape(F32, {1}); HloComputation::Builder builder(TestName() + ".Dot"); HloInstruction* x = @@ -2991,15 +3293,16 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) { call_builder.AddInstruction( HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get())); - module().AddEmbeddedComputation(std::move(dot_computation)); - module().AddEntryComputation(call_builder.Build()); + m->AddEmbeddedComputation(std::move(dot_computation)); + m->AddEntryComputation(call_builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } // Test that a constant with tuple shape becomes a tuple of constants. 
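In HLO-text terms, the next test starts from a single kConstant whose shape is the tuple (f32[], f32[3]) and expects it to be split apart; the expected result is roughly (illustrative names; the values come from the test):

  c0     = f32[] constant(7.3)
  c1     = f32[3] constant({1.1, 2, 3.3})
  ROOT t = (f32[], f32[3]) tuple(c0, c1)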
TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); const float constant_scalar = 7.3f; std::initializer_list constant_vector = {1.1f, 2.0f, 3.3f}; @@ -3008,11 +3311,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) { Literal value = LiteralUtil::MakeTuple({&elements[0], &elements[1]}); builder.AddInstruction(HloInstruction::CreateConstant(std::move(value))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Tuple(op::Constant(), op::Constant())); } @@ -3021,6 +3324,7 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) { // of its input equals the size of its output. In this case, the dynamic slice // is equal to its input. TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {10, 100, 1000}); @@ -3032,10 +3336,10 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) { 1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")), /*slice_sizes=*/{10, 100, 1000})); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Parameter()); } @@ -3043,6 +3347,7 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) { // size of its "update" equals the size of its output. In this case, the // dynamic-update-slice is equal to its update. TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000}); @@ -3065,16 +3370,17 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) { builder.AddInstruction(HloInstruction::CreateParameter( 3, ShapeUtil::MakeShape(U32, {3}), "update_indices")))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::DynamicSlice(op::Parameter(), op::Parameter())); } // Test that two consecutive broadcasts can be merged to one. 
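The broadcast-merging rewrite checked below composes the dimension maps of two stacked broadcasts into one. A minimal sketch (shapes illustrative, not from the patch):

  x  = f32[2] parameter(0)
  b1 = f32[2,3] broadcast(x), dimensions={0}
  b2 = f32[4,2,3] broadcast(b1), dimensions={1,2}

Since x's only dimension maps to output dimension 1 through the two broadcasts, this collapses to a single f32[4,2,3] broadcast(x), dimensions={1}.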
TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2}); HloInstruction* input_array = builder.AddInstruction( @@ -3085,12 +3391,12 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) { builder.AddInstruction( HloInstruction::CreateBroadcast(r3f32, inner_bcast, {0, 2})); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(op::Constant())); EXPECT_THAT(root->dimensions(), ElementsAre(2)); @@ -3098,6 +3404,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) { // Test that two consecutive broadcasts can be merged to one. TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 3}); Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3}); @@ -3111,12 +3418,12 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) { builder.AddInstruction( HloInstruction::CreateBroadcast(r4f32, inner_bcast, {1, 2, 3})); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(op::Parameter(0))); EXPECT_THAT(root->dimensions(), ElementsAre(1, 3)); @@ -3124,6 +3431,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) { // Test that a broadcast of an iota can be merged to one iota. TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2}); HloInstruction* iota = @@ -3131,12 +3439,12 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) { Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 2, 2}); builder.AddInstruction(HloInstruction::CreateBroadcast(r3f32, iota, {0, 2})); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Iota()); EXPECT_EQ(Cast(root)->iota_dimension(), 2); @@ -3144,6 +3452,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) { // Test that a broadcast of an iota can be merged to one iota. 
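Broadcasting an iota is itself just an iota over the larger shape, with the iota dimension remapped through the broadcast. Sketch (illustrative, not from the patch):

  i = f32[2,2] iota(), iota_dimension=1
  b = f32[2,2,2] broadcast(i), dimensions={0,2}

This simplifies to a single f32[2,2,2] iota(), iota_dimension=2, because the original iota dimension 1 lands at output dimension 2, which is what the iota_dimension() checks below verify.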
TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3}); HloInstruction* iota = @@ -3152,12 +3461,12 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) { builder.AddInstruction( HloInstruction::CreateBroadcast(r4f32, iota, {1, 2, 3})); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Iota()); EXPECT_EQ(Cast(root)->iota_dimension(), 2); @@ -3174,9 +3483,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) { ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[2:3],[0:1]} } )"; - TF_ASSERT_OK_AND_ASSIGN( - auto module, - HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, bitcasting_callback()); @@ -3196,9 +3504,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) { ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[6:7],[9:10]} } )"; - TF_ASSERT_OK_AND_ASSIGN( - auto module, - HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, bitcasting_callback()); @@ -3218,9 +3525,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) { ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]} } )"; - TF_ASSERT_OK_AND_ASSIGN( - auto module, - HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, bitcasting_callback()); @@ -3238,9 +3544,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) { ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[3:4],[4:5]} } )"; - TF_ASSERT_OK_AND_ASSIGN( - auto module, - HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, bitcasting_callback()); @@ -3249,6 +3554,92 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) { EXPECT_THAT(root, op::Parameter()); } +TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + param.0 = f32[2] parameter(0) + param.1 = f32[1] parameter(1) + param.2 = f32[3] parameter(2) + concat = f32[6] concatenate(param.0, param.1, param.2), dimensions={0} + ROOT slice = f32[1] slice(concat), slice={[2:3]} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + bitcasting_callback()); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Parameter(1)); +} + +TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) { + const 
char* hlo_string = R"( + HloModule module + + ENTRY test { + param.0 = f32[2] parameter(0) + param.1 = f32[1] parameter(1) + param.2 = f32[3] parameter(2) + concat = f32[6] concatenate(param.0, param.1, param.2), dimensions={0} + ROOT slice = f32[1] slice(concat), slice={[4:5]} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + bitcasting_callback()); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Slice(op::Parameter(2))); + EXPECT_EQ(root->slice_starts(0), 1); + EXPECT_EQ(root->slice_limits(0), 2); +} + +TEST_F(AlgebraicSimplifierTest, NegateNegate) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + param.0 = f32[2] parameter(0) + neg.0 = f32[2] negate(param.0) + ROOT neg.1 = f32[2] negate(neg.0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + bitcasting_callback()); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Parameter(0)); +} + +TEST_F(AlgebraicSimplifierTest, NotNot) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + param.0 = pred[2] parameter(0) + not.0 = pred[2] not(param.0) + ROOT not.1 = pred[2] not(not.0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + bitcasting_callback()); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Parameter(0)); +} + struct PadReduceWindowEffectiveBroadcastCase { std::vector input_spatials; std::vector symmetric_pad_spatials; @@ -3278,6 +3669,7 @@ class PadReduceWindowEffectiveBroadcastTest PadReduceWindowEffectiveBroadcastCase> {}; TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { + auto m = CreateNewVerifiedModule(); const auto& param = GetParam(); // a and b are parallel bounds we can either turn into a B F S0 S1 or @@ -3326,7 +3718,7 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { HloInstruction::CreateParameter(1, scalar_shape, "p1")); builder.AddInstruction( HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); - add_computation = module().AddEmbeddedComputation(builder.Build()); + add_computation = m->AddEmbeddedComputation(builder.Build()); } Window window = window_util::MakeWindow( @@ -3340,10 +3732,10 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { builder.AddInstruction(HloInstruction::CreateReduceWindow( output_shape, pad, zero, window, add_computation)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( @@ -3392,6 +3784,7 @@ class DotStrengthReductionTest public ::testing::WithParamInterface< ::testing::tuple> {}; TEST_P(DotStrengthReductionTest, DotStrengthReduction) { + auto module = CreateNewVerifiedModule(); int m, k, n; bool transpose_lhs, transpose_rhs; PrimitiveType element_type; @@ 
-3421,10 +3814,10 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) { dot_dnums.add_rhs_contracting_dimensions(0); builder.AddInstruction(HloInstruction::CreateDot( dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get())); const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1; const bool computation_should_be_modified = dot_should_be_transformed || (transpose_lhs && transpose_rhs); @@ -3452,7 +3845,7 @@ struct DotOfConcatTestSpec { }; class DotOfConcatSimplificationTest - : public HloVerifiedTestBase, + : public HloTestBase, public ::testing::WithParamInterface {}; // Test that we transform @@ -3460,6 +3853,7 @@ class DotOfConcatSimplificationTest // to // add(dot(const_0, A), dot(const_1, B), dot(const_2, C)) TEST_P(DotOfConcatSimplificationTest, ConstantLHS) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); DotOfConcatTestSpec spec = GetParam(); @@ -3498,10 +3892,10 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) { builder.AddInstruction(HloInstruction::CreateDot( dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( @@ -3519,6 +3913,7 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) { // to // add(dot(A, const_0), dot(B, const_1), dot(C, const_2)) TEST_P(DotOfConcatSimplificationTest, ConstantRHS) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); DotOfConcatTestSpec spec = GetParam(); @@ -3562,10 +3957,10 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) { builder.AddInstruction(HloInstruction::CreateDot( dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape)); @@ -3590,6 +3985,7 @@ DotOfConcatTestSpec kDotOfConcatTestSpecs[] = { // Test that DynamicUpdateSlice update param with any dimension equal to zero // gets removed. 
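The zero-sized-update case for dynamic-update-slice below amounts to the following (illustrative HLO; the index operand's exact shape is an assumption, not taken from the patch):

  operand  = f32[10] parameter(0)
  update   = f32[0] parameter(1)
  idx      = u32[1] parameter(2)
  ROOT dus = f32[10] dynamic-update-slice(operand, update, idx)

Because the update has a dimension of size zero it cannot modify anything, so the simplifier replaces the whole dynamic-update-slice with operand.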
TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); const Shape dslice_shape = ShapeUtil::MakeShape(F32, {10}); HloInstruction* const operand = builder.AddInstruction( @@ -3602,11 +3998,11 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) { builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( dslice_shape, operand, update, start_indices)); const HloComputation* const computation = - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), operand); } @@ -3625,7 +4021,7 @@ struct DotOfGatherTestSpec { }; class DotOfGatherSimplificationTest - : public HloVerifiedTestBase, + : public HloTestBase, public ::testing::WithParamInterface {}; // input: dot(DS(ctA), ctB)) @@ -3634,6 +4030,7 @@ class DotOfGatherSimplificationTest // output: DS(dot(ctA, ctB)) // => output dimensions: DS ({M x N}, {s, 0}, {1, N}) => {1 x N}. TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); DotOfGatherTestSpec spec = GetParam(); @@ -3680,10 +4077,10 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { builder.AddInstruction(HloInstruction::CreateDot( dot_shape, ds, rhs, dot_dnums, DefaultPrecisionConfig(2))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape)); @@ -3704,6 +4101,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { // output: DS(dot(ctA, ctB)) // => output dimensions: DS ({M x N}, {0, s}, {M, 1}) => {M x 1}. TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); DotOfGatherTestSpec spec = GetParam(); @@ -3750,10 +4148,10 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { builder.AddInstruction(HloInstruction::CreateDot( dot_shape, lhs, ds, dot_dnums, DefaultPrecisionConfig(2))); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape)); diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc index 38f1a5d3a64..52ec1a794c5 100644 --- a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc +++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc @@ -17,14 +17,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" namespace xla { namespace { namespace op = xla::testing::opcode_matchers; -class BatchDotSimplificationTest : public HloVerifiedTestBase {}; +class BatchDotSimplificationTest : public HloTestBase {}; TEST_F(BatchDotSimplificationTest, ElideSingleDegenerateBatchDotDim_VectorVector) { @@ -38,11 +37,12 @@ main { } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_text)); BatchDotSimplification pass; - ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + ASSERT_TRUE(pass.Run(m.get()).ValueOrDie()); - HloInstruction* root = module().entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reshape(op::Dot( op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), @@ -61,11 +61,12 @@ main { } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_text)); BatchDotSimplification pass; - ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + ASSERT_TRUE(pass.Run(m.get()).ValueOrDie()); - HloInstruction* root = module().entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reshape(op::Dot( op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), @@ -84,11 +85,12 @@ main { } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_text)); BatchDotSimplification pass; - ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + ASSERT_TRUE(pass.Run(m.get()).ValueOrDie()); - HloInstruction* root = module().entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reshape(op::Dot( op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), @@ -107,11 +109,12 @@ main { } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_text)); BatchDotSimplification pass; - ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + ASSERT_TRUE(pass.Run(m.get()).ValueOrDie()); - HloInstruction* root = module().entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reshape(op::Dot( op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), @@ -130,11 +133,12 @@ main { } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_text)); BatchDotSimplification pass; - ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + ASSERT_TRUE(pass.Run(m.get()).ValueOrDie()); - HloInstruction* root = module().entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reshape(op::Dot( op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), @@ -153,11 +157,12 @@ main { } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_text)); BatchDotSimplification pass; - ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + ASSERT_TRUE(pass.Run(m.get()).ValueOrDie()); - HloInstruction* root = module().entry_computation()->root_instruction(); + HloInstruction* root = 
m->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reshape(op::Dot( op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc index f7ac8f54829..08cf8026177 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc +++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc @@ -29,14 +29,14 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { namespace { -using BatchNormExpanderTest = HloVerifiedTestBase; +using BatchNormExpanderTest = HloTestBase; // Test that we expand BatchNormTraining. TEST_F(BatchNormExpanderTest, BatchNormTraining) { @@ -59,14 +59,14 @@ TEST_F(BatchNormExpanderTest, BatchNormTraining) { param0, param1, param2, /*epsilon=*/0.001, /*feature_index=*/3)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormTraining); BatchNormExpander rewriter(/*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); - ASSERT_TRUE(rewriter.Run(module).ValueOrDie()); + ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); // Make sure this operation is expanded. EXPECT_EQ(root->opcode(), HloOpcode::kTuple); @@ -101,14 +101,14 @@ TEST_F(BatchNormExpanderTest, BatchNormGrad) { param1, param2, param3, param4, /*epsilon=*/0.001, /*feature_index=*/3)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormGrad); BatchNormExpander rewriter(/*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); - ASSERT_TRUE(rewriter.Run(module).ValueOrDie()); + ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); // Make sure this operation is expanded. 
EXPECT_EQ(root->opcode(), HloOpcode::kTuple); @@ -126,13 +126,13 @@ ENTRY entry { epsilon=0.001, feature_index=1, sharding={maximal device=1} })"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str)); BatchNormExpander rewriter(/*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); - ASSERT_TRUE(rewriter.Run(&module()).ValueOrDie()); + ASSERT_TRUE(rewriter.Run(m.get()).ValueOrDie()); - for (auto* instruction : module().entry_computation()->instructions()) { + for (auto* instruction : m->entry_computation()->instructions()) { if (instruction->opcode() == HloOpcode::kParameter) { continue; } diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc index 5f93740887a..4ce351acc2c 100644 --- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -65,11 +65,11 @@ class TestBFloat16Support : public BFloat16Support { } }; -class BFloat16ConversionFoldingTest : public HloVerifiedTestBase { +class BFloat16ConversionFoldingTest : public HloTestBase { protected: BFloat16ConversionFoldingTest() - : HloVerifiedTestBase(/*layout_sensitive=*/false, - /*allow_mixed_precision=*/true) {} + : HloTestBase(/*verifier_layout_sensitive=*/false, + /*allow_mixed_precision_in_hlo_verifier=*/true) {} bool FoldConversions(HloModule* module) { TestBFloat16Support bfloat16_support_; @@ -103,10 +103,10 @@ TEST_F(BFloat16ConversionFoldingTest, FoldIfSupported) { HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, convert1, c)); builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, add1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConversions(module)); + EXPECT_TRUE(FoldConversions(module.get())); EXPECT_EQ(computation->root_instruction(), add1); EXPECT_EQ(add0->shape().element_type(), BF16); @@ -138,10 +138,10 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldIfUnsupported) { HloInstruction* convert2 = builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, mul1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConversions(module)); + EXPECT_FALSE(FoldConversions(module.get())); EXPECT_EQ(computation->root_instruction(), convert2); EXPECT_EQ(mul0->shape().element_type(), F32); @@ -173,10 +173,10 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldUnsupportedMixedPrecision) { HloInstruction* convert2 = builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, sub1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConversions(module)); + EXPECT_FALSE(FoldConversions(module.get())); EXPECT_EQ(computation->root_instruction(), convert2); EXPECT_EQ(sub0->shape().element_type(), F32); @@ -203,10 +203,10 @@ 
TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) { HloInstruction* convert1 = builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConversions(module)); + EXPECT_FALSE(FoldConversions(module.get())); EXPECT_EQ(computation->root_instruction(), convert1); EXPECT_EQ(gte->shape().element_type(), F32); @@ -216,7 +216,7 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) { TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) { auto builder = HloComputation::Builder(TestName()); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder sum_builder("add"); auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x")); @@ -252,7 +252,7 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConversions(module)); + EXPECT_TRUE(FoldConversions(module.get())); EXPECT_EQ(computation->root_instruction(), tuple); EXPECT_EQ(tuple->operand(0), gte_a); diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc index cb075a5e38a..9f97d18c565 100644 --- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc @@ -23,7 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -68,11 +68,11 @@ class TestBFloat16Support : public BFloat16Support { } }; -class BFloat16NormalizationTest : public HloVerifiedTestBase { +class BFloat16NormalizationTest : public HloTestBase { protected: BFloat16NormalizationTest() - : HloVerifiedTestBase(/*layout_sensitive=*/false, - /*allow_mixed_precision=*/true) {} + : HloTestBase(/*verifier_layout_sensitive=*/false, + /*allow_mixed_precision_in_hlo_verifier=*/true) {} bool Normalize(HloModule* module) { TestBFloat16Support bfloat16_support_; @@ -106,10 +106,10 @@ TEST_F(BFloat16NormalizationTest, NoopIfSupported) { HloInstruction* add1 = builder.AddInstruction( HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, add0, c)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(Normalize(module)); + EXPECT_FALSE(Normalize(module.get())); EXPECT_EQ(computation->root_instruction(), add1); EXPECT_EQ(add0->shape().element_type(), BF16); @@ -134,10 +134,10 @@ TEST_F(BFloat16NormalizationTest, ResolveIfUnsupportedBF16) { HloInstruction* mul1 = builder.AddInstruction( HloInstruction::CreateBinary(bf16_shape, HloOpcode::kMultiply, mul0, c)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(Normalize(module)); + EXPECT_TRUE(Normalize(module.get())); EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert); EXPECT_EQ(computation->root_instruction()->operand(0), mul1); @@ -164,10 
+164,10 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionSubtraction) { HloInstruction* sub1 = builder.AddInstruction( HloInstruction::CreateBinary(bf16_shape, HloOpcode::kSubtract, sub0, c)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(Normalize(module)); + EXPECT_TRUE(Normalize(module.get())); EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert); EXPECT_EQ(computation->root_instruction()->operand(0), sub1); @@ -191,7 +191,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) { HloInstruction::CreateBinary(bf16_scalar_shape, HloOpcode::kAdd, reduce_comp_param0, reduce_comp_param1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto reduce_computation = module->AddEmbeddedComputation(reduce_comp_builder.Build()); @@ -205,7 +205,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(Normalize(module)); + EXPECT_TRUE(Normalize(module.get())); EXPECT_EQ(computation->root_instruction(), reduce); EXPECT_EQ(reduce->called_computations().size(), 1); @@ -233,7 +233,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) { } TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder sum_builder("sum"); auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x")); @@ -263,7 +263,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(Normalize(module)); + EXPECT_TRUE(Normalize(module.get())); EXPECT_EQ(computation->root_instruction(), gte); EXPECT_EQ(gte->shape().element_type(), BF16); @@ -272,7 +272,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) { } TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape f32_shape = ShapeUtil::MakeShape(F32, {1024}); Shape bf16_shape = ShapeUtil::MakeShape(BF16, {1024}); @@ -290,7 +290,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(Normalize(module)); + EXPECT_TRUE(Normalize(module.get())); EXPECT_EQ(computation->root_instruction(), gte); EXPECT_EQ(gte->shape().element_type(), BF16); @@ -299,7 +299,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) { } TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape f32_shape = ShapeUtil::MakeShape(F32, {1024}); Shape bf16_shape = ShapeUtil::MakeShape(BF16, {1024}); @@ -314,7 +314,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(Normalize(module)); + EXPECT_TRUE(Normalize(module.get())); EXPECT_EQ(sort->operand(0)->shape().element_type(), F32); EXPECT_EQ(ShapeUtil::GetSubshape(sort->shape(), {0}).element_type(), F32); @@ -342,10 +342,10 @@ 
TEST_F(BFloat16NormalizationTest, DoNotAddUnsupportedMixedPrecision) { HloInstruction* dot = builder.AddInstruction( HloInstruction::CreateDot(bf16_shape, a, b, dot_dnums, precision_config)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(Normalize(module)); + EXPECT_TRUE(Normalize(module.get())); EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert); EXPECT_EQ(dot->shape().element_type(), F32); diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index 0af71eaac96..5be7141aae4 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -55,11 +55,11 @@ class TestBFloat16Support : public BFloat16Support { } }; -class BFloat16PropagationTest : public HloVerifiedTestBase { +class BFloat16PropagationTest : public HloTestBase { protected: BFloat16PropagationTest() - : HloVerifiedTestBase(/*layout_sensitive=*/false, - /*allow_mixed_precision=*/true) {} + : HloTestBase(/*verifier_layout_sensitive=*/false, + /*allow_mixed_precision_in_hlo_verifier=*/true) {} // Runs the propagation pass on the given module, and returns whether the // module is changed after this pass. 
@@ -121,10 +121,10 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSelectButNotAdd) {
   HloInstruction* root = builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kAdd, dot, dot));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), root);
   EXPECT_TRUE(OutputsBF16(xpose));
@@ -136,6 +136,62 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSelectButNotAdd) {
   EXPECT_FALSE(OutputsBF16(c));
 }
 
+TEST_F(BFloat16PropagationTest, PropagateThroughMaxPoolReduceWindow) {
+  auto module = CreateNewVerifiedModule();
+
+  auto sub_builder = HloComputation::Builder("max");
+  HloInstruction* p0 = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "a"));
+  HloInstruction* p1 = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "b"));
+  sub_builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {}), HloOpcode::kMaximum, p0, p1));
+  auto max_computation = module->AddEmbeddedComputation(sub_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* c =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "c"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_high(1);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+  HloInstruction* rw =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          shape, add,
+          builder.AddInstruction(
+              HloInstruction::CreateConstant(LiteralUtil::Zero(F32))),
+          window, max_computation));
+  HloInstruction* xpose =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {4, 2}), c, {1, 0}));
+  HloInstruction* dot = builder.AddInstruction(
+      CreateDot(ShapeUtil::MakeShape(F32, {4, 4}), xpose, rw));
+  HloInstruction* root = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kAdd, dot, dot));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), root);
+  EXPECT_TRUE(OutputsBF16(add));
+  EXPECT_TRUE(OutputsBF16(xpose));
+  EXPECT_TRUE(OutputsBF16(rw));
+}
+
 // Tests that side-effecting all-reduce should not be changed.
TEST_F(BFloat16PropagationTest, DoNotChangeAllReduce) { auto module = CreateNewVerifiedModule(); @@ -186,10 +242,10 @@ TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) { HloInstruction::CreateConstant(LiteralUtil::CreateFromArray(array_b))); HloInstruction* dot = builder.AddInstruction(CreateDot(shape, a, b)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), dot); EXPECT_TRUE(OutputsBF16(dot->operand(0))); @@ -242,10 +298,10 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTuples) { HloInstruction* output_tuple = builder.AddInstruction(HloInstruction::CreateTuple({dot, add2})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), output_tuple); EXPECT_TRUE(OutputsBF16(xpose)); @@ -281,10 +337,10 @@ TEST_F(BFloat16PropagationTest, SameValueReferencedTwice) { HloInstruction* dot = builder.AddInstruction( CreateDot(ShapeUtil::MakeShape(F32, {4, 4}), lhs, rhs)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), dot); EXPECT_TRUE(OutputsBF16(add1)); @@ -310,10 +366,10 @@ TEST_F(BFloat16PropagationTest, DoNotChangeComputationRoot) { HloInstruction* tuple = builder.AddInstruction(HloInstruction::CreateTuple({add, dot})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(PropagatePrecision(module)); + EXPECT_FALSE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), tuple); EXPECT_FALSE(OutputsBF16(add)); @@ -321,7 +377,7 @@ TEST_F(BFloat16PropagationTest, DoNotChangeComputationRoot) { // Tests that BF16 is propagated properly through fused computations. TEST_F(BFloat16PropagationTest, PropagateThroughFusion) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); @@ -356,7 +412,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), fusion1); EXPECT_TRUE(OutputsBF16(add)); @@ -369,7 +425,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) { // Tests that changes to BF16 that cannot be propagated outside a fusion are // discarded. 
TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); @@ -393,7 +449,7 @@ TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(PropagatePrecision(module)); + EXPECT_FALSE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), fusion); } @@ -408,7 +464,7 @@ TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) { // (BF16, BF16) fusion_computation(F32 a, F32 b) // = tuple(BF16 convert(a), BF16 add(F32 a, F32 b)) TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); @@ -439,7 +495,7 @@ TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), dot); EXPECT_TRUE(OutputsBF16(gte0)); @@ -458,7 +514,7 @@ TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) { // on_true and on_false must match, so that as long as one of them is F32, the // other must be F32 as well. TEST_F(BFloat16PropagationTest, SelectOverTuples) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {2, 4}); @@ -489,7 +545,7 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) { auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), dot); EXPECT_FALSE(OutputsBF16(add0)); @@ -502,7 +558,7 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) { // Tests that BF16 is propagated properly through a while computation with // non-tuple input/output. TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); @@ -545,7 +601,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) { auto dot = builder.AddInstruction(CreateDot(shape, while_hlo, while_hlo)); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), dot); EXPECT_TRUE( @@ -561,7 +617,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) { // made to the while body and thus the fusion node inside it. 
TEST_F(BFloat16PropagationTest, ConditionPreventsPropagationForFusionInsideWhile) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); @@ -610,7 +666,7 @@ TEST_F(BFloat16PropagationTest, auto dot = builder.AddInstruction(CreateDot(shape, while_hlo, while_hlo)); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(PropagatePrecision(module)); + EXPECT_FALSE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), dot); EXPECT_FALSE(OutputsBF16(add)); EXPECT_FALSE(OutputsBF16(body_fusion)); @@ -622,7 +678,7 @@ TEST_F(BFloat16PropagationTest, // Tests that BF16 is propagated properly through while computations with // tuple-shaped input/output. TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); @@ -690,7 +746,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) { auto dot = builder.AddInstruction(CreateDot(shape, lhs, rhs)); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), dot); EXPECT_TRUE(OutputsBF16(lhs)); @@ -709,7 +765,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) { // Tests that BF16 is not propagated through multiple whiles that invoke the // same computation as long as one while prevents the propagation. TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); @@ -820,7 +876,7 @@ TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) { auto dot = builder.AddInstruction(CreateDot(shape, lhs, rhs)); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_FALSE(OutputsBF16(body_dot)); EXPECT_FALSE(OutputsBF16(body_rhs)); EXPECT_FALSE(OutputsBF16(body_lhs)); @@ -859,10 +915,10 @@ TEST_F(BFloat16PropagationTest, NoopConversionRemoved) { HloInstruction* add2 = builder.AddInstruction(HloInstruction::CreateBinary( bf16_shape, HloOpcode::kAdd, convert0, convert1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), add2); EXPECT_EQ(add2->operand(0), add0); @@ -895,10 +951,10 @@ TEST_F(BFloat16PropagationTest, TupleDomain) { HloInstruction* root = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(PropagatePrecision(module)); + EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), root); // test BF16 propagated through domain @@ -941,10 +997,10 @@ TEST_F(BFloat16PropagationTest, TupleDomainNoPropagation) { HloInstruction* root = builder.AddInstruction( 
HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), root);
   EXPECT_TRUE(OutputsBF16(a_trans));
diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc
index 5b48f10505e..2b9502f63a8 100644
--- a/tensorflow/compiler/xla/service/bfloat16_support.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_support.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/bfloat16_support.h"
 
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -107,6 +108,21 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision(
     case HloOpcode::kSelect:
     case HloOpcode::kTupleSelect:
       return operand_index == 1 || operand_index == 2;
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow: {
+      HloComputation* reduce_comp = hlo.called_computations()[0];
+      for (HloInstruction* inst : reduce_comp->instructions()) {
+        if (inst->opcode() == HloOpcode::kParameter) {
+          continue;
+        }
+        for (int64 i = 0; i < inst->operand_count(); ++i) {
+          if (!EffectiveOperandPrecisionIsOutputPrecision(*inst, i)) {
+            return false;
+          }
+        }
+      }
+      return true;
+    }
     default:
       break;
   }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index ee4e5942731..40c012a5e42 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -641,7 +641,7 @@ Status BufferAssignment::ComputeSummaryStats() {
   bool schedule_complete = true;
   for (const auto& computation : module_->computations()) {
     if (!computation->IsFusionComputation()) {
-      const std::vector<const HloInstruction*>* sequence =
+      const HloInstructionSequence* sequence =
          liveness_->hlo_ordering().SequentialOrder(*computation);
       if (sequence == nullptr) {
         schedule_complete = false;
@@ -1180,7 +1180,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       const HloComputation* computation = pair.first;
       const flat_hash_set<const LogicalBuffer*>& buffers_to_assign = pair.second;
-      const std::vector<const HloInstruction*>* instruction_sequence =
+      const HloInstructionSequence* instruction_sequence =
          hlo_ordering.SequentialOrder(*computation);
       CHECK(instruction_sequence != nullptr) << computation->name();
       schedule.set_sequence(computation, *instruction_sequence);
@@ -1215,7 +1215,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       const HloComputation* computation = pair.first;
       const flat_hash_set<const LogicalBuffer*>& buffers_to_assign = pair.second;
-      const std::vector<const HloInstruction*>* instruction_sequence =
+      const HloInstructionSequence* instruction_sequence =
          hlo_ordering.SequentialOrder(*computation);
       CHECK(instruction_sequence != nullptr) << computation->name();
       auto color_map = SplitBuffersByColor(buffers_to_assign);
@@ -1230,7 +1230,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       TF_ASSIGN_OR_RETURN(
           const HeapSimulator::Result result,
           HeapSimulator::Run(get_heap_algorithm(alignment), *computation,
-                             HloInstructionSequence(*instruction_sequence),
+                             *instruction_sequence,
                              assignment->points_to_analysis(),
                              assignment->buffer_size_,
options)); AssignBuffersFromHeapSimulator(result, assignment, diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index 327211d3efd..b1fc50cb188 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -38,7 +38,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -81,7 +81,7 @@ const std::vector GetInstructions(HloInstruction* root) { return main_list.GetInstructions(); } -class BufferAssignmentTest : public HloVerifiedTestBase { +class BufferAssignmentTest : public HloTestBase { protected: ~BufferAssignmentTest() override {} @@ -334,16 +334,16 @@ TEST_F(BufferAssignmentTest, ScalarConstant) { auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); { - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); EXPECT_TRUE(buffers->HasTopLevelAllocation(const0)); } { - auto buffers = RunBufferAssignmentNoBuffersForConstants(module); + auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get()); EXPECT_FALSE(buffers->HasTopLevelAllocation(const0)); } } @@ -358,17 +358,17 @@ TEST_F(BufferAssignmentTest, BufferForConst) { LiteralUtil::CreateR1({4.1f, 4.2f, 4.3f, 4.4f}))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, const0, const1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); { - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); EXPECT_TRUE(buffers->HasTopLevelAllocation(const0)); EXPECT_TRUE(buffers->HasTopLevelAllocation(const1)); GetAssignedOutputAllocation(*buffers, add); } { - auto buffers = RunBufferAssignmentNoBuffersForConstants(module); + auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get()); EXPECT_FALSE(buffers->HasTopLevelAllocation(const0)); EXPECT_FALSE(buffers->HasTopLevelAllocation(const1)); GetAssignedOutputAllocation(*buffers, add); @@ -387,10 +387,10 @@ TEST_F(BufferAssignmentTest, HasAllocationAt) { HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({negate, param0, constant})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); // Make sure that HasAllocationAt() agrees with what HasTopLevelAllocation() // reports for the instruction directly. 
EXPECT_EQ(buffers->HasTopLevelAllocation(tuple), @@ -410,10 +410,10 @@ TEST_F(BufferAssignmentTest, BufferForOutputConst) { LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); auto copy = builder.AddInstruction( HloInstruction::CreateUnary(const0->shape(), HloOpcode::kCopy, const0)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); // The copy node now has an output buffer. GetAssignedOutputAllocation(*buffers, copy); } @@ -439,10 +439,10 @@ TEST_F(BufferAssignmentTest, Basic) { HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction(HloInstruction::CreateBinary( f32vec100_, HloOpcode::kSubtract, add, param1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); // Distinct input buffers were assigned for parameters. BufferAllocation paramscalar_buffer = @@ -538,7 +538,7 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) { HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction(HloInstruction::CreateBinary( f32vec100_, HloOpcode::kSubtract, add, param1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto colorer = [](const BufferLiveness& buffer_liveness) { @@ -553,7 +553,7 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) { return Status::OK(); }; - auto buffers = RunColoredBufferAssignment(module, colorer); + auto buffers = RunColoredBufferAssignment(module.get(), colorer); // Distinct input buffers were assigned for parameters. BufferAllocation paramscalar_buffer = @@ -599,7 +599,7 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) { HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction(HloInstruction::CreateBinary( f32vec100_, HloOpcode::kSubtract, add, param1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto colorer = [](const BufferLiveness& buffer_liveness) { @@ -622,7 +622,7 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) { return Status::OK(); }; - auto buffers = RunColoredBufferAssignment(module, colorer); + auto buffers = RunColoredBufferAssignment(module.get(), colorer); // Distinct input buffers were assigned for parameters. BufferAllocation paramscalar_buffer = @@ -671,10 +671,10 @@ TEST_F(BufferAssignmentTest, MultipleUsersForNode) { HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction( HloInstruction::CreateBinary(f32vec100_, HloOpcode::kSubtract, add, mul)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); // Input buffers were assigned for parameters. BufferAllocation paramscalar_buffer = @@ -706,7 +706,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) { // param0[100x10] ---> (map x+1) // // Builds the map function. 
- auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto map_computation = module->AddEmbeddedComputation(BuildMapComputationPlus1("f32+1")); auto inner_last = map_computation->root_instruction(); @@ -725,7 +725,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) { EXPECT_EQ(3, level1.size()) << "Invalid nested add+1 size"; // Assigns buffers and fetches sizes. - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); int64 size0 = ValidateBuffers(level0, *buffers); int64 size1 = ValidateBuffers(level1, *buffers); @@ -761,7 +761,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) { // out-of-order reductions could overwrite an element before a use.) // // param0[100] --- (exp1) --- (exp2) --- (reduce x+y) --- (exp3) - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto reduce_computation = module->AddEmbeddedComputation(BuildReduceComputation("f32+f32")); @@ -784,7 +784,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) { module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); const std::vector instrs = GetInstructions(exp3); ValidateBuffers(instrs, *buffers); @@ -812,7 +812,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) { // const4[f32[4]] --- tuple --- while[condition, body] // // Builds the nested condition and body. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto condition_computation = module->AddEmbeddedComputation(BuildWhileConditionComputation("if<4")); auto body_computation = @@ -840,7 +840,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) { EXPECT_EQ(8, levelb.size()) << "Invalid nested body size"; // Assigns buffers and fetches sizes. 
- auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); int64 size0 = ValidateBuffers(level0, *buffers); int64 sizec = ValidateBuffers(levelc, *buffers); int64 sizeb = ValidateBuffers(levelb, *buffers); @@ -878,7 +878,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) { } TEST_F(BufferAssignmentTest, ExampleConditional) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto true_computation = module->AddEmbeddedComputation( BuildR0F32UnaryOpComputation(HloOpcode::kCeil, "Ceil")); auto false_computation = module->AddEmbeddedComputation( @@ -905,7 +905,7 @@ TEST_F(BufferAssignmentTest, ExampleConditional) { EXPECT_EQ(2, true_instrs.size()); EXPECT_EQ(2, false_instrs.size()); - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); ValidateBuffers(conditional_instrs, *buffers); ValidateBuffers(true_instrs, *buffers); ValidateBuffers(false_instrs, *buffers); @@ -941,9 +941,9 @@ TEST_F(BufferAssignmentTest, UnaryOpReuseChain) { auto neg = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, exp2)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // tanh and exp2 can reuse exp1's buffer EXPECT_TRUE(assignment->HasTopLevelAllocation(exp1)); @@ -970,9 +970,9 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) { auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // negate and broadcast should share a buffer. EXPECT_TRUE(assignment->HasTopLevelAllocation(broadcast)); @@ -1003,9 +1003,9 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) { HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); builder.AddInstruction(HloInstruction::CreateTuple({negate, broadcast})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // The instructions should not share buffers. EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), @@ -1040,9 +1040,9 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) { HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); builder.AddInstruction(HloInstruction::CreateTuple({tuple, broadcast})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // The instructions should not share buffers. EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), @@ -1075,9 +1075,9 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) { auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {10, 4}), slice, {0})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // The broadcast output buffer cannot be shared. 
EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), @@ -1107,9 +1107,9 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) { auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {10, 10}), slice, {0})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // negate and broadcast should share a buffer. EXPECT_TRUE(assignment->HasTopLevelAllocation(broadcast)); @@ -1145,9 +1145,9 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) { ShapeUtil::MakeShape(F32, {10, 4}), slice, {0})); builder.AddInstruction(HloInstruction::CreateTuple({broadcast})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // The broadcast output buffer cannot be shared. EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), @@ -1160,7 +1160,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) { // Verify that buffers for embedded computations are properly marked as // thread-local and that embedded parameters are not marked as // is_entry_computation_parameter. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto vec_shape = ShapeUtil::MakeShape(F32, {42}); auto scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -1191,7 +1191,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) { HloInstruction::CreateMap(vec_shape, {call}, map_computation)); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // Allocations for the map computation should be thread-local and not // live-out. @@ -1238,9 +1238,9 @@ TEST_F(BufferAssignmentTest, TupleParameterAsOutput) { ShapeUtil::MakeShape(S32, {42})}), "param0")); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // There should be four allocations: one for vector of pointers, and one for // each tuple element. @@ -1274,9 +1274,9 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( ShapeUtil::GetSubshape(tuple_param->shape(), {1}), tuple_param, 1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // Only some of the elements of the input param are liveout. 
EXPECT_FALSE( @@ -1318,9 +1318,9 @@ TEST_F(BufferAssignmentTest, TupleConstantAsOutput) { builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::MakeTuple({&elements[0], &elements[1]}))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); EXPECT_EQ(3, assignment->Allocations().size()); } @@ -1332,9 +1332,9 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) { ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(PRED, {1, 2, 3, 4}), ShapeUtil::MakeShape(S32, {101})}), /*operands=*/{}, /*custom_call_target=*/"foo_function")); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); EXPECT_EQ(3, assignment->Allocations().size()); EXPECT_TRUE( @@ -1347,7 +1347,7 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) { TEST_F(BufferAssignmentTest, TupleCallAsOutput) { // Test a computation which returns a tuple call value. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto elem_shape = f32vec4_; auto tuple_shape = ShapeUtil::MakeTupleShape({elem_shape}); @@ -1365,7 +1365,7 @@ TEST_F(BufferAssignmentTest, TupleCallAsOutput) { HloInstruction::CreateCall(tuple_shape, {param}, sub_computation)); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); EXPECT_EQ(2, assignment->Allocations().size()); // Buffers for call are colocated with the sub-computation. @@ -1388,7 +1388,7 @@ TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) { // B: call(C, param) // C: call(D, param) // D: param - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto elem_shape = f32vec4_; auto tuple_shape = ShapeUtil::MakeTupleShape({elem_shape}); @@ -1427,7 +1427,7 @@ TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) { module->AddEntryComputation(std::move(a_computation)); module->AddEmbeddedComputation(std::move(b_computation)); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // Buffers for call are colocated with the sub-computations. EXPECT_EQ(GetAllocation(*assignment, a_call, /*index=*/{}), @@ -1461,9 +1461,9 @@ TEST_F(BufferAssignmentTest, BitcastAsOutput) { auto bitcast = builder.AddInstruction( HloInstruction::CreateUnary(param->shape(), HloOpcode::kBitcast, param)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // Bitcast should get the same allocation as the param. EXPECT_EQ(1, assignment->Allocations().size()); @@ -1488,9 +1488,9 @@ TEST_F(BufferAssignmentTest, AmbiguousBufferAsOutput) { HloInstruction::CreateTernary(tuple_shape, HloOpcode::kTupleSelect, pred_param, tuple_param0, tuple_param1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // Select shallow copies one of its operands so it defines its own top-level // buffer and receives its own allocation. 
@@ -1526,9 +1526,9 @@ TEST_F(BufferAssignmentTest, TupleBufferNotReused) { auto copy = builder.AddInstruction(HloInstruction::CreateUnary( scalar_shape, HloOpcode::kCopy, tuple_element)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module); + auto assignment = RunBufferAssignment(module.get()); // There should be no buffer reuse. The copy should not reuse the tuple // buffer. @@ -1568,9 +1568,9 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) { HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 0)); // Run buffer assignment with alignment=1. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module, /*alignment=*/1); + auto assignment = RunBufferAssignment(module.get(), /*alignment=*/1); // There are 5 allocations: 3 parameters, 1 output, and 1 temp. EXPECT_EQ(5, assignment->Allocations().size()); @@ -1589,7 +1589,7 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) { EXPECT_EQ(80, slice_bc.allocation()->size()); // Re-run buffer assignment with alignment=64. - assignment = RunBufferAssignment(module, /*alignment=*/64); + assignment = RunBufferAssignment(module.get(), /*alignment=*/64); EXPECT_EQ(5, assignment->Allocations().size()); slice_ab = assignment->GetUniqueTopLevelSlice(dot_ab).ConsumeValueOrDie(); slice_bc = assignment->GetUniqueTopLevelSlice(dot_bc).ConsumeValueOrDie(); @@ -1632,10 +1632,10 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) { HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); builder.AddInstruction(HloInstruction::CreateBinary( f32vec100_, HloOpcode::kSubtract, add, param1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); const std::vector& peak_buffers = @@ -1673,11 +1673,11 @@ TEST_F(BufferAssignmentTest, PeakBuffers) { ShapeUtil::MakeShape(F32, {1}), concat, {0}, {1}, {1})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto buffers = RunBufferAssignmentWithInstructionSequence( - module, {param, log, rev, neg, concat, root}); + module.get(), {param, log, rev, neg, concat, root}); // The temporary buffer should hold the 4 interior instructions. 
const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, concat); @@ -1698,7 +1698,7 @@ TEST_F(BufferAssignmentTest, PeakBuffers) { } TEST_F(BufferAssignmentTest, PeakBuffersWhile) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); const Shape shape = ShapeUtil::MakeShape(F32, {123, 123}); HloComputation* condition; { @@ -1733,7 +1733,7 @@ TEST_F(BufferAssignmentTest, PeakBuffersWhile) { ShapeUtil::MakeShape(F32, {123, 123, 123}), bcast, {0})); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module); + auto buffers = RunBufferAssignment(module.get()); const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, bcast); const std::vector<const LogicalBuffer*>& peak_buffers = buffer.PeakMemoryLogicalBuffers(); @@ -1783,13 +1783,13 @@ ENTRY main { } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_text)); HloInstruction* constant_1 = - module().entry_computation()->GetInstructionWithName("constant.1.1"); + m->entry_computation()->GetInstructionWithName("constant.1.1"); HloInstruction* constant_2 = - module().entry_computation()->GetInstructionWithName("constant.1.2"); + m->entry_computation()->GetInstructionWithName("constant.1.2"); - auto buffers = RunBufferAssignment(&module()); + auto buffers = RunBufferAssignment(m.get()); { const BufferAllocation& allocation_for_const_1 = @@ -1818,7 +1818,7 @@ ENTRY main { } } -class WhileBufferAssignmentTest : public HloVerifiedTestBase { +class WhileBufferAssignmentTest : public HloTestBase { protected: std::unique_ptr<HloComputation> BuildWhileConditionComputation( const string& name) { @@ -1878,7 +1878,7 @@ static void RunCopyInsertion(HloModule* module) { } TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder("entry"); auto input0 = builder.AddInstruction( @@ -1917,8 +1917,8 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) { HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1)); module->AddEntryComputation(builder.Build()); - RunCopyInsertion(module); - auto assignment = RunBufferAssignment(module); + RunCopyInsertion(module.get()); + auto assignment = RunBufferAssignment(module.get()); // Verify 'input0' and read-only use while0{0} alias. EXPECT_EQ(assignment->GetUniqueSlice(input0, {}).ConsumeValueOrDie(), @@ -1974,20 +1974,19 @@ ENTRY %test_module { ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={} })"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str)); // Run CopyInsertion and check if the graph constructed above doesn't need // any copies inserted for BufferAssignment to run. - int64 instruction_count = module().instruction_count(); + int64 instruction_count = m->instruction_count(); CopyInsertion copy_insertion; - ASSERT_IS_OK(copy_insertion.Run(&module()).status()); - ASSERT_EQ(instruction_count, module().instruction_count()); + ASSERT_IS_OK(copy_insertion.Run(m.get()).status()); + ASSERT_EQ(instruction_count, m->instruction_count()); // Get the instructions in the module.
- const HloInstruction* bcast = - module().entry_computation()->root_instruction(); + const HloInstruction* bcast = m->entry_computation()->root_instruction(); const HloInstruction* param = - module().entry_computation()->parameter_instruction(0); + m->entry_computation()->parameter_instruction(0); ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast); const HloInstruction* while1 = bcast->operand(0); ASSERT_EQ(while1->opcode(), HloOpcode::kWhile); @@ -1995,7 +1994,7 @@ ENTRY %test_module { ASSERT_EQ(while0->opcode(), HloOpcode::kWhile); // Run buffer assignment. - auto assignment = RunBufferAssignment(&module()); + auto assignment = RunBufferAssignment(m.get()); TF_ASSERT_OK_AND_ASSIGN(auto slice_param, assignment->GetUniqueSlice(param, {})); TF_ASSERT_OK_AND_ASSIGN(auto slice_while0, @@ -2042,20 +2041,19 @@ ENTRY %test_module { ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={} })"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str)); // Run CopyInsertion and check if the graph constructed above doesn't need // any copies inserted for BufferAssignment to run. - int64 instruction_count = module().instruction_count(); + int64 instruction_count = m->instruction_count(); CopyInsertion copy_insertion; - ASSERT_IS_OK(copy_insertion.Run(&module()).status()); - ASSERT_EQ(instruction_count, module().instruction_count()); + ASSERT_IS_OK(copy_insertion.Run(m.get()).status()); + ASSERT_EQ(instruction_count, m->instruction_count()); // Get the instructions in the module. - const HloInstruction* bcast = - module().entry_computation()->root_instruction(); + const HloInstruction* bcast = m->entry_computation()->root_instruction(); const HloInstruction* constant = - module().entry_computation()->GetInstructionWithName("constant.42"); + m->entry_computation()->GetInstructionWithName("constant.42"); ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast); const HloInstruction* while1 = bcast->operand(0); ASSERT_EQ(while1->opcode(), HloOpcode::kWhile); @@ -2063,7 +2061,7 @@ ENTRY %test_module { ASSERT_EQ(while0->opcode(), HloOpcode::kWhile); // Run buffer assignment. - auto assignment = RunBufferAssignment(&module()); + auto assignment = RunBufferAssignment(m.get()); TF_ASSERT_OK_AND_ASSIGN(auto slice_constant, assignment->GetUniqueSlice(constant, {})); TF_ASSERT_OK_AND_ASSIGN(auto slice_while0, @@ -2121,7 +2119,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { }; // Build the entry computation as described in the comment above. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder("entry"); auto token = builder.AddInstruction(HloInstruction::CreateToken()); @@ -2156,7 +2154,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { // any copies inserted for BufferAssignment to run. 
int64 instruction_count = module->instruction_count(); CopyInsertion copy_insertion; - ASSERT_IS_OK(copy_insertion.Run(module).status()); + ASSERT_IS_OK(copy_insertion.Run(module.get()).status()); ASSERT_EQ(instruction_count, module->instruction_count()); // Create a sequential order among all the instructions in the entry @@ -2175,12 +2173,12 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { TF_ASSERT_OK_AND_ASSIGN( auto assignment, - BufferAssigner::Run(module, - absl::make_unique<SequentialHloOrdering>(schedule), - backend().compiler()->BufferSizeBytesFunction(), - [](LogicalBuffer::Color) { return 1; }, - /*allow_input_output_aliasing=*/false, - /*allocate_buffers_for_constants=*/true)); + BufferAssigner::Run( + module.get(), absl::make_unique<SequentialHloOrdering>(schedule), + backend().compiler()->BufferSizeBytesFunction(), + [](LogicalBuffer::Color) { return 1; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true)); // The result tuple elements must be assigned with different buffers. TF_ASSERT_OK_AND_ASSIGN(auto slice0, assignment->GetUniqueSlice(tuple, {0})); @@ -2202,7 +2200,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { } TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder("entry"); auto input0 = builder.AddInstruction( @@ -2234,8 +2232,8 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) { HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, while0)); module->AddEntryComputation(builder.Build()); - RunCopyInsertion(module); - auto assignment = RunBufferAssignment(module); + RunCopyInsertion(module.get()); + auto assignment = RunBufferAssignment(module.get()); // while0 and while1 buffers should be completely aligned.
EXPECT_EQ(assignment->GetUniqueSlice(while0, {0}).ConsumeValueOrDie(), @@ -2247,7 +2245,7 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) { } TEST_F(BufferAssignmentTest, TwoCalls) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {}); HloComputation* sub_computation; { @@ -2277,13 +2275,13 @@ TEST_F(BufferAssignmentTest, TwoCalls) { { FlattenCallGraph flatten; - TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get())); EXPECT_TRUE(result); - std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module); + std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get()); } - RunCopyInsertion(module); - auto assignment = RunBufferAssignment(module); + RunCopyInsertion(module.get()); + auto assignment = RunBufferAssignment(module.get()); EXPECT_TRUE(BuffersDistinct({call1}, {call2}, *assignment)); } @@ -2308,13 +2306,14 @@ ENTRY Main { )"; HloModuleConfig config; - config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); - ParseAndVerifyModule(hlo_text, config); + config.set_debug_options(GetDebugOptionsFromFlags()); + TF_ASSERT_OK_AND_ASSIGN(auto m, + ParseAndReturnVerifiedModule(hlo_text, config)); - auto buffers = RunBufferAssignment(&module()); + auto buffers = RunBufferAssignment(m.get()); - HloComputation* main = module().entry_computation(); - HloComputation* callee = module().GetComputationWithName("Callee"); + HloComputation* main = m->entry_computation(); + HloComputation* callee = m->GetComputationWithName("Callee"); EXPECT_NE(callee, nullptr); HloInstruction* param0 = callee->parameter_instruction(0); @@ -2338,7 +2337,7 @@ ENTRY Main { } TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto zero = builder.AddInstruction( @@ -2385,11 +2384,11 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { { FlattenCallGraph flatten; - TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get())); EXPECT_TRUE(result); } - RunCopyInsertion(module); + RunCopyInsertion(module.get()); HloSchedule schedule = ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie(); @@ -2407,18 +2406,18 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { TF_ASSERT_OK(schedule.Verify()); auto assignment = - BufferAssigner::Run(module, - absl::make_unique<SequentialHloOrdering>(schedule), - ByteSizeOf, [](LogicalBuffer::Color) { return 1; }, - /*allow_input_output_aliasing=*/false, - /*allocate_buffers_for_constants=*/true) + BufferAssigner::Run( + module.get(), absl::make_unique<SequentialHloOrdering>(schedule), + ByteSizeOf, [](LogicalBuffer::Color) { return 1; }, - /*allow_input_output_aliasing=*/false, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true) .ConsumeValueOrDie(); EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment)); } TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder("entry"); auto input0 = builder.AddInstruction( @@ -2462,8 +2461,8 @@ TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) { HloInstruction::CreateGetTupleElement(data_shape_, while1, 2)); module->AddEntryComputation(builder.Build()); - RunCopyInsertion(module); - auto assignment = RunBufferAssignment(module); +
RunCopyInsertion(module.get()); + auto assignment = RunBufferAssignment(module.get()); // Get BufferAllocation for root instruction. auto* root_alloc = assignment->GetUniqueTopLevelSlice(while1_out) .ConsumeValueOrDie() diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc index 17e50905059..aeee543e843 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc @@ -117,7 +117,7 @@ TEST_F(BufferLivenessTest, ElementwiseChain) { auto log = builder.AddInstruction( HloInstruction::CreateUnary(vec_, HloOpcode::kLog, exp)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -164,7 +164,7 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) { auto add = builder.AddInstruction( HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* entry = module->AddEntryComputation(builder.Build()); HloSchedule schedule(module.get()); @@ -213,7 +213,7 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) { auto reverse = builder.AddInstruction(HloInstruction::CreateReverse(vec_, negate, {0})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -247,7 +247,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) { auto add = builder.AddInstruction( HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -289,7 +289,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) { auto add = builder.AddInstruction( HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloSchedule schedule(module.get()); @@ -336,7 +336,7 @@ TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) { HloInstruction::CreateSend(recv_done, token, /*channel_id=*/1)); auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build(add)); HloSchedule schedule(module.get()); @@ -373,7 +373,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) { auto outer_tuple = builder.AddInstruction(HloInstruction::CreateTuple({inner_tuple, exp})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -393,7 +393,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) { TEST_F(BufferLivenessTest, EmbeddedComputation) { // Test MaybeLiveOut and MayInterfere for embedded computation. 
- auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto embedded_builder = HloComputation::Builder(TestName() + "_embedded"); auto embedded_param = embedded_builder.AddInstruction( @@ -450,7 +450,7 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( inner_tuple0.shape(), tuple_constant, 0)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -514,7 +514,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) { auto tuple_root = builder.AddInstruction(HloInstruction::CreateTuple({add0, add1})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(BuildDummyComputation()); module->AddEmbeddedComputation(builder.Build()); @@ -576,7 +576,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) { auto tuple_root = builder.AddInstruction(HloInstruction::CreateTuple({add0, add1})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(BuildDummyComputation()); module->AddEmbeddedComputation(builder.Build()); @@ -646,7 +646,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { builder.AddInstruction( HloInstruction::CreateTuple({gte0, dynamic_update_slice})); // Build module and get reference to entry computation. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto* computation = module->entry_computation(); // Create fusion instruction based on number of tuple element 1 users. @@ -802,7 +802,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest { auto tuple_root = builder.AddInstruction( HloInstruction::CreateTuple({gte0, dynamic_update_slice})); // Build module and get reference to entry computation. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(BuildDummyComputation()); module->AddEmbeddedComputation(builder.Build()); // Run BufferLiveness on 'module'. diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc index 34f3f914d59..a3ac2568b0f 100644 --- a/tensorflow/compiler/xla/service/call_graph_test.cc +++ b/tensorflow/compiler/xla/service/call_graph_test.cc @@ -21,7 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -31,7 +31,7 @@ namespace { using ::testing::UnorderedElementsAre; -class CallGraphTest : public HloVerifiedTestBase { +class CallGraphTest : public HloTestBase { protected: // Build and return a trivial computation taking and returning a scalar. std::unique_ptr MakeScalarComputation( @@ -93,10 +93,10 @@ class CallGraphTest : public HloVerifiedTestBase { TEST_F(CallGraphTest, SingletonComputation) { // Test the call graph of a module with a single computation. 
- auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(MakeScalarComputation()); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(1, call_graph->nodes().size()); EXPECT_TRUE(call_graph->IsFlattened()); @@ -112,13 +112,13 @@ TEST_F(CallGraphTest, SingletonComputation) { TEST_F(CallGraphTest, UnreachableComputation) { // Test the call graph of a module with an entry computation and an // unreachable computation. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(MakeScalarComputation()); HloComputation* unreachable_computation = module->AddEmbeddedComputation(MakeScalarComputation()); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(2, call_graph->nodes().size()); const CallGraphNode& entry_node = call_graph->GetNode(entry_computation); @@ -134,13 +134,13 @@ TEST_F(CallGraphTest, UnreachableComputation) { TEST_F(CallGraphTest, ParallelComputation) { // Test a call graph of a module with an entry computation which calls another // computation in a parallel context via kMap. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* map_computation = module->AddEmbeddedComputation(MakeScalarComputation()); HloComputation* entry_computation = module->AddEntryComputation( MakeMappingComputation(map_computation, /*callsites=*/5)); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(2, call_graph->nodes().size()); const CallGraphNode& entry_node = call_graph->GetNode(entry_computation); @@ -163,13 +163,13 @@ TEST_F(CallGraphTest, ParallelComputation) { TEST_F(CallGraphTest, SequentialComputations) { // Test a call graph of a module with an entry computation which calls another // computation in a sequential context via kCall. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* called_computation = module->AddEmbeddedComputation(MakeScalarComputation()); HloComputation* entry_computation = module->AddEntryComputation( MakeCallingComputation(called_computation, /*callsites=*/3)); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(2, call_graph->nodes().size()); // The called computation is only called from one other computation, but there @@ -196,7 +196,7 @@ TEST_F(CallGraphTest, SequentialComputations) { TEST_F(CallGraphTest, ContextBothComputations) { // Test a call graph of a module with an entry computation which calls another // computation in both a parallel and sequential context. 
- auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* subcomputation = module->AddEmbeddedComputation(MakeScalarComputation()); @@ -210,7 +210,7 @@ TEST_F(CallGraphTest, ContextBothComputations) { HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(2, call_graph->nodes().size()); EXPECT_FALSE(call_graph->IsFlattened()); @@ -239,7 +239,7 @@ TEST_F(CallGraphTest, ContextBothComputations) { TEST_F(CallGraphTest, ComputationWithConditional) { // Test a call graph of a module with a conditional. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* true_computation = module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kCeil)); HloComputation* false_computation = @@ -259,7 +259,7 @@ TEST_F(CallGraphTest, ComputationWithConditional) { HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(3, call_graph->nodes().size()); @@ -298,7 +298,7 @@ TEST_F(CallGraphTest, ComplexGraph) { // c // // Calls are made via kCall, kWhile, and kMap instructions. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* cond_computation = module->AddEmbeddedComputation(MakeConditionComputation()); HloComputation* c_computation = @@ -328,7 +328,7 @@ TEST_F(CallGraphTest, ComplexGraph) { entry_computation = module->AddEntryComputation(builder.Build()); } - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(5, call_graph->nodes().size()); EXPECT_FALSE(call_graph->IsFlattened()); @@ -418,7 +418,7 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) { // c // // Calls are made via kCall, kWhile, and kMap instructions. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* cond_computation = module->AddEmbeddedComputation(MakeConditionComputation()); HloComputation* c_computation = @@ -452,7 +452,7 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) { entry_computation = module->AddEntryComputation(builder.Build()); } - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(5, call_graph->nodes().size()); // Verify NearestAncestorsInSameComputation for various instructions in the @@ -479,10 +479,10 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) { TEST_F(CallGraphTest, VisitSingletonComputation) { // Test the call graph visitor with a call graph with a single node. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(MakeScalarComputation()); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); std::vector visited; TF_ASSERT_OK(call_graph->VisitNodes([&visited](const CallGraphNode& node) { @@ -494,12 +494,12 @@ TEST_F(CallGraphTest, VisitSingletonComputation) { TEST_F(CallGraphTest, VisitUnreachableComputation) { // Test the call graph visitor with a call graph with an unreachable node. 
- auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(MakeScalarComputation()); HloComputation* unreachable_computation = module->AddEmbeddedComputation(MakeScalarComputation()); - std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module); + std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get()); // Test visitation of only reachable nodes. { @@ -531,9 +531,9 @@ TEST_F(CallGraphTest, VisitUnreachableComputation) { TEST_F(CallGraphTest, VisitWithError) { // Test that the call graph visitor properly propagates errors. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(MakeScalarComputation()); - std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module); + std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get()); Status status = call_graph->VisitNodes( [](const CallGraphNode&) { return InternalError("Visitation failed"); }); diff --git a/tensorflow/compiler/xla/service/call_inliner_test.cc b/tensorflow/compiler/xla/service/call_inliner_test.cc index e6b56654359..0b6e323f75c 100644 --- a/tensorflow/compiler/xla/service/call_inliner_test.cc +++ b/tensorflow/compiler/xla/service/call_inliner_test.cc @@ -28,7 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -40,7 +40,7 @@ namespace { // Tests for call inlining that are most tractable at the HLO level (vs // ComputationBuilder API in call_test.cc). -using CallInlinerTest = HloVerifiedTestBase; +using CallInlinerTest = HloTestBase; TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) { // "inner" computation just has a control dependency from the "zero" value to @@ -51,7 +51,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) { HloInstruction* one = inner.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f))); TF_ASSERT_OK(zero->AddControlDependencyTo(one)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* inner_computation = module->AddEmbeddedComputation(inner.Build()); @@ -64,7 +64,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) { auto computation = module->AddEntryComputation(outer.Build()); CallInliner call_inliner; - TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get())); ASSERT_TRUE(mutated); EXPECT_THAT(computation->root_instruction(), op::Constant()); EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<float>(), @@ -79,7 +79,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) { // returns false should be identical to just returning false). TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) { const Shape pred = ShapeUtil::MakeShape(PRED, {}); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); // Create a lambda that calls a function that returns the false predicate.
// Note we also use this lambda twice by reference, just to make the test a @@ -107,7 +107,7 @@ TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) { auto computation = module->AddEntryComputation(outer.Build()); CallInliner call_inliner; - TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get())); ASSERT_TRUE(mutated); EXPECT_THAT( computation->root_instruction()->while_condition()->root_instruction(), @@ -120,7 +120,7 @@ TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) { // whole pass. TEST_F(CallInlinerTest, InlineWithoutRunningPass) { const Shape pred = ShapeUtil::MakeShape(PRED, {}); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder just_false(TestName() + ".false"); auto* true_constant = just_false.AddInstruction( @@ -144,7 +144,7 @@ TEST_F(CallInlinerTest, InlineWithoutRunningPass) { TEST_F(CallInlinerTest, CallToOutfeedComputationIsInlined) { const Shape f32 = ShapeUtil::MakeShape(F32, {}); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder outfeeder(TestName() + ".outfeeder"); auto value = outfeeder.AddInstruction( @@ -163,7 +163,7 @@ TEST_F(CallInlinerTest, CallToOutfeedComputationIsInlined) { module->AddEntryComputation(outer.Build()); CallInliner call_inliner; - TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get())); ASSERT_TRUE(mutated); } diff --git a/tensorflow/compiler/xla/service/compilation_cache.cc b/tensorflow/compiler/xla/service/compilation_cache.cc new file mode 100644 index 00000000000..2662fe46705 --- /dev/null +++ b/tensorflow/compiler/xla/service/compilation_cache.cc @@ -0,0 +1,70 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/compilation_cache.h" + +#include + +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +namespace { + +int64 GetUniqueId() { + static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); + static int64 counter = 0; + tensorflow::mutex_lock loc(mu); + const int64 id = counter++; + return id; +} + +} // namespace + +ExecutionHandle CompilationCache::Insert( + std::unique_ptr<Executable> executable) { + tensorflow::mutex_lock lock(mutex_); + + CacheKey key = GetUniqueId(); + VLOG(2) << "inserting cache key: " << key; + CHECK_EQ(cache_.count(key), 0); + cache_.emplace(key, std::move(executable)); + + ExecutionHandle handle; + handle.set_handle(key); + return handle; +} + +StatusOr<std::shared_ptr<Executable>> CompilationCache::LookUp( + const ExecutionHandle& handle) const { + tensorflow::mutex_lock lock(mutex_); + + CacheKey key = handle.handle(); + VLOG(2) << "looking up cache key: " << key; + if (cache_.count(key) == 0) { + VLOG(2) << "cache key not found: " << key; + return InvalidArgumentStrCat("can not find executable with handle ", key); + } else { + auto& result = cache_.at(key); + VLOG(2) << "hit executable: " << result->module().name(); + return result; + } +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/compilation_cache.h b/tensorflow/compiler/xla/service/compilation_cache.h new file mode 100644 index 00000000000..5f94def509d --- /dev/null +++ b/tensorflow/compiler/xla/service/compilation_cache.h @@ -0,0 +1,62 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace xla { + +// A cache which stores Executables indexed by computation handle and version. +// +// TODO(b/119042872): Provide mechanism for removing computations from the +// compilation cache. +class CompilationCache { + public: + CompilationCache() {} + + ExecutionHandle Insert(std::unique_ptr<Executable> executable); + + // Lookup the Executable for the specified handle in the cache. Return a + // shared_ptr to the Executable if it exists in the cache.
+ StatusOr<std::shared_ptr<Executable>> LookUp( + const ExecutionHandle& handle) const; + + protected: + mutable tensorflow::mutex mutex_; + + using CacheKey = int64; + + absl::flat_hash_map<CacheKey, std::shared_ptr<Executable>> cache_ + GUARDED_BY(mutex_); + + private: + TF_DISALLOW_COPY_AND_ASSIGN(CompilationCache); +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_ diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index 6d67f970020..67132274c0d 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -20,7 +20,7 @@ limitations under the License. #include #include "absl/strings/str_cat.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/platform_util.h" diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc index 80c630c6201..8f08c244908 100644 --- a/tensorflow/compiler/xla/service/compiler.cc +++ b/tensorflow/compiler/xla/service/compiler.cc @@ -110,6 +110,6 @@ Compiler::GetPlatformCompilers() { } AotCompilationOptions::AotCompilationOptions() - : debug_options_(legacy_flags::GetDebugOptionsFromFlags()) {} + : debug_options_(GetDebugOptionsFromFlags()) {} } // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc index c43a31b167d..289eb6d9023 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc @@ -25,7 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" @@ -37,7 +37,7 @@ namespace { namespace op = xla::testing::opcode_matchers; -class ConditionalSimplifierTest : public HloVerifiedTestBase { +class ConditionalSimplifierTest : public HloTestBase { public: // Makes a computation that contains a conditional with constant predicate.
HloComputation* MakeConditional(HloModule* module); @@ -96,25 +96,28 @@ HloComputation* ConditionalSimplifierTest::MakeConditional(HloModule* module) { } TEST_F(ConditionalSimplifierTest, ConditionalGetsInlined) { - HloComputation* computation = MakeConditional(&module()); - ASSERT_TRUE(ConditionalSimplifier().Run(&module()).ValueOrDie()); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = MakeConditional(m.get()); + ASSERT_TRUE(ConditionalSimplifier().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Parameter(), op::Constant())); } TEST_F(ConditionalSimplifierTest, ConditionalWithControlDependency) { - HloComputation* computation = MakeConditional(&module()); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = MakeConditional(m.get()); auto* true_op = computation->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); TF_ASSERT_OK( true_op->AddControlDependencyTo(computation->root_instruction())); - EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie()); } TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsSend) { - HloComputation* computation = MakeConditional(&module()); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = MakeConditional(m.get()); auto* conditional = computation->root_instruction(); ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional); @@ -125,11 +128,12 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsSend) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))), token, /*channel_id=*/0)); true_computation->AddInstruction(HloInstruction::CreateSendDone(send)); - EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie()); } TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsRecv) { - HloComputation* computation = MakeConditional(&module()); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = MakeConditional(m.get()); auto* conditional = computation->root_instruction(); ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional); @@ -138,18 +142,19 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsRecv) { auto* recv = true_computation->AddInstruction(HloInstruction::CreateRecv( ShapeUtil::MakeShape(F32, {1}), token, /*channel_id=*/0)); true_computation->AddInstruction(HloInstruction::CreateRecvDone(recv)); - EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie()); } TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsNonRemovableInstruction) { - HloComputation* computation = MakeConditional(&module()); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = MakeConditional(m.get()); auto* conditional = computation->root_instruction(); ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional); auto* false_computation = conditional->false_computation(); auto token = false_computation->AddInstruction(HloInstruction::CreateToken()); false_computation->AddInstruction(HloInstruction::CreateInfeed( ShapeUtil::MakeShape(F32, {1}), token, "config")); - EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie()); } } // namespace diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc index 
0ac4a65ec6a..7f7f1503a09 100644 --- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc @@ -51,7 +51,8 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault { Status HandleConvolution(HloInstruction* convolution) override; // Runs the visitor on a computation. - static bool Run(HloComputation* computation); + static bool Run(HloComputation* computation, + bool canonicalize_depthwise_filter); // Returns whether any convolution ops were rewritten. const bool changed() const { return changed_; } @@ -59,18 +60,24 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault { ~ConvolutionVisitor() override = default; private: - explicit ConvolutionVisitor(HloComputation* computation) - : computation_(computation) {} + explicit ConvolutionVisitor(HloComputation* computation, + bool canonicalize_depthwise_filter = false) + : computation_(computation), + filter_expansion_(!canonicalize_depthwise_filter) {} // Current HloComputation instance the ConvolutionVisitor is traversing. HloComputation* computation_; // Whether rewrite has occurred. bool changed_ = false; + + // Whether filter expansion is required. + bool filter_expansion_; }; -bool ConvolutionVisitor::Run(HloComputation* computation) { - ConvolutionVisitor visitor(computation); +bool ConvolutionVisitor::Run(HloComputation* computation, + bool canonicalize_depthwise_filter) { + ConvolutionVisitor visitor(computation, canonicalize_depthwise_filter); TF_CHECK_OK(computation->Accept(&visitor)); return visitor.changed_; } @@ -190,9 +197,49 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { HloInstruction* filter_mask = GetExpandedFilterMask( filter->shape(), input_feature_dim, output_feature_dim, group_count, add); HloInstruction* expanded_filter; - // We want to repeat 'filter' in the 'input_feature_dim' dimension - // 'group_count' times. + if (group_size == 1) { + bool depthwise_separable = + (group_count == filter->shape().dimensions(output_feature_dim)); + // If the code generator handles depthwise separable convolutions + // inherently, then no filter expansion is needed. + if (!filter_expansion_ && depthwise_separable) { + const int64 old_kernel_input_feature_dimension = + dim_numbers.kernel_input_feature_dimension(); + const int64 old_kernel_output_feature_dimension = + dim_numbers.kernel_output_feature_dimension(); + + // For depthwise convolutions, we want the kernel input feature dimension + // to be smaller than the output feature dimension. If that's not the + // case, we swap the dimensions. 
+ if (old_kernel_input_feature_dimension > + old_kernel_output_feature_dimension) { + Shape reshaped_filter_shape = filter->shape(); + auto& dimensions = *reshaped_filter_shape.mutable_dimensions(); + std::swap(dimensions[old_kernel_input_feature_dimension], + dimensions[old_kernel_output_feature_dimension]); + + auto reshaped_filter = + add(HloInstruction::CreateReshape(reshaped_filter_shape, filter)); + + dim_numbers.set_kernel_input_feature_dimension( + old_kernel_output_feature_dimension); + + dim_numbers.set_kernel_output_feature_dimension( + old_kernel_input_feature_dimension); + + auto new_convolution = HloInstruction::CreateConvolve( + convolution->shape(), convolution->mutable_operand(0), + reshaped_filter, group_count, convolution->window(), dim_numbers, + convolution->precision_config()); + + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( + convolution, std::move(new_convolution))); + } + return Status::OK(); + } + // We want to repeat 'filter' in the 'input_feature_dim' dimension + // 'group_count' times. Shape reshaped_filter_shape = ShapeUtil::DeleteDimension(input_feature_dim, filter->shape()); auto reshaped_filter = @@ -237,7 +284,7 @@ StatusOr ConvolutionFeatureGroupConverter::Run(HloModule* module) { module->ToString()); bool changed = false; for (auto* comp : module->MakeNonfusionComputations()) { - if (ConvolutionVisitor::Run(comp)) { + if (ConvolutionVisitor::Run(comp, filter_expansion_)) { changed = true; } } diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h index ce0138e56fb..cb6bc04c00a 100644 --- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h @@ -27,7 +27,8 @@ namespace xla { // convolutions with feature_group_count = 1. class ConvolutionFeatureGroupConverter : public HloModulePass { public: - ConvolutionFeatureGroupConverter() {} + ConvolutionFeatureGroupConverter(bool canonicalize_depthwise_filter = false) + : filter_expansion_(canonicalize_depthwise_filter) {} absl::string_view name() const override { return "convolution-feature-group-converter"; @@ -36,6 +37,9 @@ class ConvolutionFeatureGroupConverter : public HloModulePass { // Run convolution rewriting on the given computation. Returns whether the // computation was changed. StatusOr Run(HloModule* module) override; + + // Tells whether filter expansion is required. + bool filter_expansion_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 4533ebb99bb..7446bc7cc11 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -94,7 +94,7 @@ TEST_F(CopyInsertionTest, SingleParameter) { EXPECT_THAT(x->users(), UnorderedElementsAre(tuple)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); InsertCopies(module.get()); @@ -114,7 +114,7 @@ TEST_F(CopyInsertionTest, SingleConstant) { EXPECT_THAT(constant->users(), UnorderedElementsAre(tuple)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); InsertCopies(module.get()); @@ -127,7 +127,7 @@ TEST_F(CopyInsertionTest, SingleConstant) { TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) { // Verify that kCopy instructions which change layout and exist before // copy-insertion remain in the graph after copy-insertion. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); HloInstruction* constant = @@ -181,7 +181,7 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) { builder.AddInstruction(HloInstruction::CreateTuple({constant2, x, add})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); InsertCopies(module.get()); @@ -217,7 +217,7 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) { EXPECT_THAT(constant2->users(), UnorderedElementsAre(tuple1, tuple2)); EXPECT_THAT(constant3->users(), UnorderedElementsAre(tuple2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); HloInstruction* old_root = module->entry_computation()->root_instruction(); @@ -238,7 +238,7 @@ TEST_F(CopyInsertionTest, BitcastParameter) { HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast)); @@ -261,7 +261,7 @@ TEST_F(CopyInsertionTest, BitcastConstant) { HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(constant->users(), UnorderedElementsAre(bitcast)); @@ -283,7 +283,7 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) { ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); builder.AddInstruction(HloInstruction::CreateTuple({bitcast})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast)); @@ -310,7 +310,7 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) { ShapeUtil::MakeShape(F32, {42})}), "param0")); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(HloOpcode::kParameter, @@ -351,7 +351,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) { auto gte = 
builder.AddInstruction(HloInstruction::CreateGetTupleElement( ShapeUtil::GetSubshape(param->shape(), {0}), param, 0)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(gte, module->entry_computation()->root_instruction()); @@ -388,7 +388,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( ShapeUtil::GetSubshape(select->shape(), {0}), select, 0)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(gte, module->entry_computation()->root_instruction()); @@ -403,7 +403,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) { class WhileCopyInsertionTest : public CopyInsertionTest { protected: - WhileCopyInsertionTest() : module_(CreateNewModule()) {} + WhileCopyInsertionTest() : module_(CreateNewUnverifiedModule()) {} // Builds a While condition computation which reads the induction variable // from the tuple parameter, and returns a predicate indicating whether this @@ -1295,7 +1295,7 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) { TEST_F(CopyInsertionTest, SwizzlingWhile) { // Test a while instruction with a body which permutes its tuple parameter // elements. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape loop_state_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1362,7 +1362,7 @@ TEST_F(CopyInsertionTest, CrossingParameters) { // | / \ | // | / \| // (p1 , p0) - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1395,7 +1395,7 @@ TEST_F(CopyInsertionTest, ParametersAliasing) { // | | // | | // (p0 , p1) - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1428,7 +1428,7 @@ TEST_F(CopyInsertionTest, ParameterWithNoAliasing) { // | | // | | // (p0 , p1) - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1461,7 +1461,7 @@ TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) { // | | // | | // (p0 , p1) - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1496,7 +1496,7 @@ TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) { // | | | // | | | // +-- (p0 , p1) - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1534,7 +1534,7 @@ TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) { // | Add----+ // | | | // +-- (p0 , p1) - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1569,7 +1569,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) { // the operation (instruction) on the element makes the live range of the // respective input and output elements different than if the instruction were // not there (as in the SwizzlingWhile test above). 
- auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape loop_state_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1632,7 +1632,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) { // the while body is a single constant (both loop state elements are the same // constant). This means no copies are necessary because both loop state // elements are the same so interchanging them is a no-op. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape loop_state_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1693,7 +1693,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) { const Shape loop_state_shape = ShapeUtil::MakeTupleShape( {element_shape, element_shape, element_shape, element_shape}); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param_0 = builder.AddInstruction( HloInstruction::CreateParameter(0, element_shape, "param_0")); @@ -1783,7 +1783,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) { TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) { // Test a while body and condition which are each simply a constant (root of // computation is a constant). The body constant should be copied. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param_0 = builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param_0")); @@ -1896,7 +1896,7 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) { tensorflow::testing::StopTiming(); for (int i = 0; i < num_iters; ++i) { HloModuleConfig config; - config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + config.set_debug_options(GetDebugOptionsFromFlags()); HloModule module("BM_SequentialWhiles", config); auto builder = HloComputation::Builder("BM_SequentialWhiles"); @@ -1936,7 +1936,7 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) { tensorflow::testing::StopTiming(); for (int i = 0; i < num_iters; ++i) { HloModuleConfig config; - config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + config.set_debug_options(GetDebugOptionsFromFlags()); HloModule module("BM_SequentialWhiles", config); auto builder = HloComputation::Builder("BM_ParallelWhiles"); @@ -2003,7 +2003,7 @@ std::unique_ptr MakeBenchmarkWhileBody( void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) { tensorflow::testing::StopTiming(); HloModuleConfig config; - config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + config.set_debug_options(GetDebugOptionsFromFlags()); CopyInsertion copy_insertion; const Shape element_shape = ShapeUtil::MakeShape(F32, {}); std::vector tuple_params(num_tuple_inputs); diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 36e25cbe678..2763d18121a 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -824,7 +824,6 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], ) @@ -846,7 +845,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/tests:hlo_test_base", - 
"//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], ) @@ -887,7 +885,6 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -961,17 +958,16 @@ tf_cc_test( srcs = ["cpu_copy_insertion_test.cc"], deps = [ ":cpu_copy_insertion", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_graph_dumper", "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -997,7 +993,6 @@ tf_cc_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc index 2083f440fdd..c58175428fe 100644 --- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc +++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc @@ -22,7 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -32,7 +32,7 @@ namespace cpu { using ::testing::ElementsAre; -class ConvCanonicalizationTest : public HloVerifiedTestBase { +class ConvCanonicalizationTest : public HloTestBase { public: ConvCanonicalizationTest() { for (int i = 0; i < 2; ++i) { @@ -87,7 +87,7 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) { input, kernel, /*feature_group_count=*/1, conv_window_, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); @@ -96,7 +96,7 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) { return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; }); ConvCanonicalization conv_canonicalization(&target_machine_features); - EXPECT_TRUE(conv_canonicalization.Run(module).ValueOrDie()); + EXPECT_TRUE(conv_canonicalization.Run(module.get()).ValueOrDie()); const HloInstruction* output_reshape = entry_computation->root_instruction(); EXPECT_EQ(HloOpcode::kTranspose, output_reshape->opcode()); @@ -150,7 +150,7 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) { input, kernel, /*feature_group_count=*/1, conv_window_, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features( @@ -158,7 +158,7 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) { return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; }); ConvCanonicalization conv_canonicalization(&target_machine_features); - EXPECT_FALSE(conv_canonicalization.Run(module).ValueOrDie()); + EXPECT_FALSE(conv_canonicalization.Run(module.get()).ValueOrDie()); } } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc index c9fb34be1cd..c085f85fb73 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -25,7 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -52,7 +52,7 @@ int64 CountCopies(const HloModule& module) { return count; } -class CpuCopyInsertionTest : public HloVerifiedTestBase { +class CpuCopyInsertionTest : public HloTestBase { protected: void InsertCopies(HloModule* module) { CpuCopyInsertion copy_insertion; @@ -65,7 +65,7 @@ class CpuCopyInsertionTest : public HloVerifiedTestBase { TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) { // Test a while body and condition which are each simply a constant (root of // computation is a constant). Each constant should be copied. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param_0 = builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param_0")); @@ -90,7 +90,7 @@ TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) { module->AddEntryComputation(builder.Build()); - InsertCopies(module); + InsertCopies(module.get()); EXPECT_EQ(CountCopies(*module), 3); @@ -103,7 +103,7 @@ TEST_F(CpuCopyInsertionTest, TupleCall) { // Test a kCall instruction which calls a computation which produces a three // element tuple: one is a constant, one is a parameter, and one is produced // in the computation. The constant and parameter should be copied. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param = builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param_0")); @@ -127,7 +127,7 @@ TEST_F(CpuCopyInsertionTest, TupleCall) { module->AddEntryComputation(builder.Build()); - InsertCopies(module); + InsertCopies(module.get()); EXPECT_EQ(CountCopies(*subcomputation), 2); EXPECT_THAT(subcomputation->root_instruction(), diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc index e6b6fcdf684..9cbfb88834b 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -25,7 +25,7 @@ namespace { using ::testing::HasSubstr; -class CpuHloSupportCheckerTest : public HloVerifiedTestBase { +class CpuHloSupportCheckerTest : public HloTestBase { protected: CpuHloSupportChecker& checker() { return checker_; } @@ -42,10 +42,10 @@ TEST_F(CpuHloSupportCheckerTest, Add) { HloInstruction::CreateParameter(1, scalar_shape, "param1")); builder.AddInstruction(HloInstruction::CreateBinary( scalar_shape, HloOpcode::kAdd, param0, param1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - TF_ASSERT_OK(checker().Run(module).status()); + TF_ASSERT_OK(checker().Run(module.get()).status()); } TEST_F(CpuHloSupportCheckerTest, SparseUnimplemented) { @@ -60,7 +60,7 @@ TEST_F(CpuHloSupportCheckerTest, SparseUnimplemented) { // Since verifier is reporting sparse layouts as errors, we should // use a regular HloModule instead of VerifiedHloModule to avoid // verifier errors being triggered in the destructor. - auto module = HloTestBase::CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); Status status = checker().Run(module.get()).status(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc index 7d99b914d4f..c95a514ca04 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc @@ -58,7 +58,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) { HloInstruction* dot = builder.AddInstruction( MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), exp0, arg1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(dot, computation->root_instruction()); EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie()); @@ -77,7 +77,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_1) { HloInstruction* dot = builder.AddInstruction( MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, exp1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(dot, computation->root_instruction()); EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie()); @@ -98,7 +98,7 @@ TEST_F(InstructionFusionTest, DotOperationNoFusion_Bitcast) { HloInstruction* dot = builder.AddInstruction( MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), bitcast0, arg1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(dot, computation->root_instruction()); EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie()); @@ -119,7 +119,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Reshape) { HloInstruction* dot = builder.AddInstruction( MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), reshape0, arg1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto 
computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(dot, computation->root_instruction()); EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie()); @@ -138,7 +138,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TooLarge) { HloInstruction* dot = builder.AddInstruction( MakeDot(ShapeUtil::MakeShape(F32, {1, 32 * 1024}), arg0, exp1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(dot, computation->root_instruction()); EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie()); @@ -157,7 +157,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_ElementReuse) { HloInstruction* dot = builder.AddInstruction( MakeDot(ShapeUtil::MakeShape(F32, {2, 1024}), arg0, exp1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(dot, computation->root_instruction()); EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie()); @@ -321,7 +321,7 @@ TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) { builder.AddInstruction( HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -350,7 +350,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) { builder.AddInstruction(HloInstruction::CreateUnary( dynamic_slice_shape, HloOpcode::kTanh, dynamic_slice4)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -370,7 +370,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Negate) { builder.AddInstruction(HloInstruction::CreateUnary( result_shape, HloOpcode::kNegate, broadcast1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -392,7 +392,7 @@ TEST_F(OpcodeFusionTest, DynamicSlice_Negate) { builder.AddInstruction(HloInstruction::CreateUnary( result_shape, HloOpcode::kNegate, dynamic_slice2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -410,7 +410,7 @@ TEST_F(OpcodeFusionTest, Exponential_Negate) { builder.AddInstruction( HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, exp1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -429,7 +429,7 @@ TEST_F(OpcodeFusionTest, Reshape_Negate) { builder.AddInstruction( HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -447,7 +447,7 @@ TEST_F(OpcodeFusionTest, Reverse_Negate) { builder.AddInstruction( HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, reverse1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -466,7 +466,7 @@ TEST_F(OpcodeFusionTest, Slice_Negate) { builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(S32, {2}), 
HloOpcode::kNegate, slice1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -489,7 +489,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) { builder.AddInstruction(HloInstruction::CreateUnary( result_shape, HloOpcode::kNegate, transpose2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -498,7 +498,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) { } TEST_F(OpcodeFusionTest, UnaryMapOfExp) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {3, 4}); @@ -517,7 +517,7 @@ TEST_F(OpcodeFusionTest, UnaryMapOfExp) { } TEST_F(OpcodeFusionTest, BinaryMapOfExps) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {3, 4}); @@ -542,7 +542,7 @@ TEST_F(OpcodeFusionTest, BinaryMapOfExps) { } TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000}); @@ -573,7 +573,7 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) { } TEST_F(OpcodeFusionTest, MessOfFusibleNodes) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); Shape full_shape = ShapeUtil::MakeShape(F32, {4, 100, 10, 100, 50}); @@ -641,7 +641,7 @@ TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastUnary) { builder.AddInstruction( HloInstruction::CreateUnary(large_shape, HloOpcode::kExp, small_exp)); - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto did_fusion = CpuInstructionFusion().Run(module.get()); @@ -670,7 +670,7 @@ TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastBinary) { builder.AddInstruction(HloInstruction::CreateBinary( large_shape, HloOpcode::kAdd, small_exp, large_param)); - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto did_fusion = CpuInstructionFusion().Run(module.get()); @@ -712,7 +712,7 @@ void CreateComputationForDotAddOutputFusionTest(const string& test_name, } TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/1, /*k=*/50, /*n=*/19, /*add_extra_use_for_dot=*/false); @@ -725,7 +725,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) { } TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19, /*k=*/50, /*n=*/1, /*add_extra_use_for_dot=*/false); @@ -738,7 +738,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) { } TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19, /*k=*/50, /*n=*/19, 
/*add_extra_use_for_dot=*/false); @@ -751,7 +751,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) { } TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19, /*k=*/50, /*n=*/1, /*add_extra_use_for_dot=*/true); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc index 97659b88a79..2cd52e4a18a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc @@ -73,7 +73,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensor) { auto result = builder.AddInstruction( CreateCanonicalDot(result_shape, dot_lhs, dot_rhs)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -114,7 +114,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor0) { builder.AddInstruction(HloInstruction::CreateBinary( result_shape, HloOpcode::kAdd, dot_a_result, dot_b_result)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -158,7 +158,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor1) { auto tuple_result = builder.AddInstruction( HloInstruction::CreateTuple({dot_a_result, dot_b_result})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -192,7 +192,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantLhsTensor) { auto dot_result = builder.AddInstruction( CreateCanonicalDot(result_shape, dot_lhs, dot_rhs)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -232,7 +232,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) { auto dot_result = builder.AddInstruction( CreateCanonicalDot(result_shape, dot_lhs, dot_rhs)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -353,7 +353,7 @@ static void AssertCorrectLayoutForDotOutputFusion( } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19, @@ -365,7 +365,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult 
layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19, @@ -377,7 +377,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1, @@ -389,7 +389,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1, @@ -401,7 +401,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19, @@ -413,7 +413,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_1) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19, diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index d6968323f33..620c45fa391 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1536,7 +1536,8 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator( case HloOpcode::kMaximum: return [root_is_floating_point, root_is_signed]( - llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { + llvm::IRBuilder<>* b, llvm::Value* lhs, + llvm::Value* rhs) -> llvm::Value* { if (root_is_floating_point) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::maxnum, {lhs, rhs}, {lhs->getType()}, b); @@ -1551,7 +1552,8 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator( case HloOpcode::kMinimum: return [root_is_floating_point, root_is_signed]( - llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { + llvm::IRBuilder<>* b, llvm::Value* lhs, + llvm::Value* rhs) -> llvm::Value* { if (root_is_floating_point) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::minnum, {lhs, rhs}, {lhs->getType()}, b); diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc index fad76338a57..f0b65046c14 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc @@ -17,13 +17,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/cpu_executable.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { -class ParallelTaskAssignmentTest : public HloVerifiedTestBase { +class ParallelTaskAssignmentTest : public HloTestBase { protected: const HloCostAnalysis::ShapeSizeFunction shape_size_func_ = cpu::CpuExecutable::ShapeSizeBytes; @@ -35,7 +35,7 @@ class ParallelTaskAssignmentTest : public HloVerifiedTestBase { cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features_; ParallelTaskAssignmentTest() - : HloVerifiedTestBase(), target_machine_features_([](int64 shape_size) { + : HloTestBase(), target_machine_features_([](int64 shape_size) { return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; }) {} @@ -57,8 +57,9 @@ TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) { } )"; - ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module())); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); EXPECT_FALSE(changed); } @@ -84,8 +85,9 @@ TEST_F(ParallelTaskAssignmentTest, } )"; - ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module())); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); EXPECT_FALSE(changed); } @@ -100,8 +102,9 @@ TEST_F(ParallelTaskAssignmentTest, RngOperationNotParallelized) { } )"; - ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module())); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); EXPECT_FALSE(changed); } @@ -116,8 +119,9 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) { } )"; - ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module())); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); EXPECT_FALSE(changed); } diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc index 1a3d82de954..7d8e51f909e 100644 --- a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc +++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc @@ -19,14 +19,14 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" namespace xla { namespace cpu { namespace { -class ShapePartitionAssignerTest : public HloVerifiedTestBase { +class ShapePartitionAssignerTest : public HloTestBase { protected: typedef std::vector Vec; @@ -91,7 +91,7 @@ TEST_F(ShapePartitionAssignerTest, Shape532WithLayout201) { expected_partitions); } -class ShapePartitionIteratorTest : public HloVerifiedTestBase { +class ShapePartitionIteratorTest : public HloTestBase { protected: typedef std::vector> Partition; }; @@ -145,7 +145,7 @@ TEST_F(ShapePartitionIteratorTest, Shape532WithLayout210) { } } -class RandomShapePartitionIteratorTest : public HloVerifiedTestBase { +class RandomShapePartitionIteratorTest : public HloTestBase { protected: typedef std::vector> Partition; RandomShapePartitionIteratorTest() diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index 4b129c95d46..382dfd0d99d 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -48,7 +48,6 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_instruction_fusion", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc index 18ee25ba915..691b3c7bee2 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc @@ -50,7 +50,7 @@ class CpuEigenDotOperationTest /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(std::move(entry_computation)); CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options, diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc index 00a7aa2ad2f..d201a151d7a 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc @@ -46,7 +46,7 @@ class CpuExternalConstantsTest : public CpuCodegenTest { builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, constant)); - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); CompileAndVerifyIr(std::move(module), filecheck_pattern, diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc index 1deb412064b..04a81dfd35f 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/test.h" @@ -34,7 +34,7 @@ namespace xla { namespace cpu { namespace { -class CpuFusionTest : public HloVerifiedTestBase { +class CpuFusionTest : public HloTestBase { protected: CpuFusionTest() {} @@ -57,11 +57,11 @@ TEST_F(CpuFusionTest, FuseTwoElementwiseOps) { builder.AddInstruction( HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, add1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); CpuInstructionFusion fusion; - EXPECT_TRUE(fusion.Run(module).ValueOrDie()); + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); // The computation root instruction was fused. Verify the fusion instruction // is now the root. @@ -104,11 +104,11 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) { builder.AddInstruction( HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, two, floor)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); CpuInstructionFusion fusion; - EXPECT_TRUE(fusion.Run(module).ValueOrDie()); + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); // The computation root instruction was fused. Verify the fusion instruction // is now the root. @@ -131,7 +131,7 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) { TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusibleInstruction) { // Test a chain of fusible ops with a non-fusible op (a reduce) thrown in the // middle. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto input_literal = LiteralUtil::CreateR1({-1.5, -2.5, -3.0}); Shape vshape = input_literal.shape(); @@ -183,7 +183,7 @@ TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusibleInstruction) { module->AddEntryComputation(builder.Build()); CpuInstructionFusion fusion; - EXPECT_TRUE(fusion.Run(module).ValueOrDie()); + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); // The computation root instruction was fused. Verify the fusion instruction // is now the root. @@ -250,12 +250,12 @@ TEST_F(CpuFusionTest, TestOperandOrderToAvoidDuplication) { builder.AddInstruction(HloInstruction::CreateTuple({add1, add2})); // Create computation and module. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); // Run fusion. CpuInstructionFusion fusion; - EXPECT_TRUE(fusion.Run(module).ValueOrDie()); + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); auto fusion1 = result->operand(0); auto fusion2 = result->operand(1); @@ -310,11 +310,11 @@ TEST_F(CpuFusionTest, DoNotDuplicateExpensiveOps) { auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({negate1, negate2, exp2})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); CpuInstructionFusion fusion; - EXPECT_TRUE(fusion.Run(module).ValueOrDie()); + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); // The only fusion instruction should be operand 0 of the tuple (formerly // negate1). 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc index a434c04a980..773336c7a92 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -91,7 +91,7 @@ TEST_P(CpuUnaryIntrinsicTest, DoIt) { /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); string check_lines{spec.check_lines.data(), spec.check_lines.size()}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc index b35fd9dad87..f5419b7063b 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc @@ -56,7 +56,7 @@ TEST_F(CpuNoAliasTest, Concat) { std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); // Now that we have an HLO module, build an llvm_ir::AliasAnalysis for it. diff --git a/tensorflow/compiler/xla/service/defuser_test.cc b/tensorflow/compiler/xla/service/defuser_test.cc index e727ba49cb6..64fb5031839 100644 --- a/tensorflow/compiler/xla/service/defuser_test.cc +++ b/tensorflow/compiler/xla/service/defuser_test.cc @@ -18,19 +18,19 @@ limitations under the License. #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" namespace op = xla::testing::opcode_matchers; namespace xla { namespace { -class DefuserTest : public HloVerifiedTestBase { +class DefuserTest : public HloTestBase { protected: // Returns the number of fusion instructions in the module. 
- int FusionCount() { + int FusionCount(const HloModule* m) { int count = 0; - for (HloComputation* computation : module().computations()) { + for (HloComputation* computation : m->computations()) { if (computation->IsFusionComputation()) { count++; } @@ -43,6 +43,7 @@ class DefuserTest : public HloVerifiedTestBase { }; TEST_F(DefuserTest, NoFusionInstruction) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0")); @@ -51,13 +52,14 @@ TEST_F(DefuserTest, NoFusionInstruction) { builder.AddInstruction( HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, param0, param1)); - module().AddEntryComputation(builder.Build()); - EXPECT_EQ(0, FusionCount()); + m->AddEntryComputation(builder.Build()); + EXPECT_EQ(0, FusionCount(m.get())); - EXPECT_FALSE(defuser_.Run(&module()).ValueOrDie()); + EXPECT_FALSE(defuser_.Run(m.get()).ValueOrDie()); } TEST_F(DefuserTest, TrivialFusionInstructionAsRoot) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0")); @@ -66,21 +68,22 @@ TEST_F(DefuserTest, TrivialFusionInstructionAsRoot) { auto add = builder.AddInstruction( HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, param0, param1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); computation->CreateFusionInstruction({add}, HloInstruction::FusionKind::kLoop); EXPECT_THAT(computation->root_instruction(), op::Fusion()); - EXPECT_EQ(1, FusionCount()); - EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie()); - EXPECT_EQ(0, FusionCount()); + EXPECT_EQ(1, FusionCount(m.get())); + EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie()); + EXPECT_EQ(0, FusionCount(m.get())); EXPECT_THAT(computation->root_instruction(), op::Add(op::Parameter(), op::Parameter())); } TEST_F(DefuserTest, TrivialFusionInstructionNotAsRoot) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0")); @@ -91,21 +94,22 @@ TEST_F(DefuserTest, TrivialFusionInstructionNotAsRoot) { builder.AddInstruction( HloInstruction::CreateUnary(shape_, HloOpcode::kNegate, add)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); computation->CreateFusionInstruction({add}, HloInstruction::FusionKind::kLoop); EXPECT_THAT(computation->root_instruction(), op::Negate(op::Fusion())); - EXPECT_EQ(1, FusionCount()); - EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie()); - EXPECT_EQ(0, FusionCount()); + EXPECT_EQ(1, FusionCount(m.get())); + EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie()); + EXPECT_EQ(0, FusionCount(m.get())); EXPECT_THAT(computation->root_instruction(), op::Negate(op::Add(op::Parameter(), op::Parameter()))); } TEST_F(DefuserTest, NonTrivialFusionInstruction) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0")); @@ -128,22 +132,23 @@ TEST_F(DefuserTest, NonTrivialFusionInstruction) { auto add2 = builder.AddInstruction( HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, constant, div)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); 
computation->CreateFusionInstruction( {add2, constant, div, mul, sub, negate, add}, HloInstruction::FusionKind::kLoop); EXPECT_THAT(computation->root_instruction(), op::Fusion()); - EXPECT_EQ(1, FusionCount()); - EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie()); - EXPECT_EQ(0, FusionCount()); + EXPECT_EQ(1, FusionCount(m.get())); + EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie()); + EXPECT_EQ(0, FusionCount(m.get())); EXPECT_THAT(computation->root_instruction(), op::Add(op::Constant(), op::Divide())); } TEST_F(DefuserTest, MultipleFusionInstructions) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0")); @@ -166,7 +171,7 @@ TEST_F(DefuserTest, MultipleFusionInstructions) { auto add2 = builder.AddInstruction( HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, constant, div)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); computation->CreateFusionInstruction({add2, constant, div, mul}, HloInstruction::FusionKind::kLoop); computation->CreateFusionInstruction({sub, negate, add}, @@ -174,15 +179,16 @@ TEST_F(DefuserTest, MultipleFusionInstructions) { EXPECT_THAT(computation->root_instruction(), op::Fusion()); - EXPECT_EQ(2, FusionCount()); - EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie()); - EXPECT_EQ(0, FusionCount()); + EXPECT_EQ(2, FusionCount(m.get())); + EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie()); + EXPECT_EQ(0, FusionCount(m.get())); EXPECT_THAT(computation->root_instruction(), op::Add(op::Constant(), op::Divide())); } TEST_F(DefuserTest, NestedFusionInstructions) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0")); @@ -193,7 +199,7 @@ TEST_F(DefuserTest, NestedFusionInstructions) { auto negate = builder.AddInstruction( HloInstruction::CreateUnary(shape_, HloOpcode::kNegate, add)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); auto outer_fusion = computation->CreateFusionInstruction( {negate, add}, HloInstruction::FusionKind::kLoop); HloInstruction* fused_negate = outer_fusion->fused_expression_root(); @@ -203,9 +209,9 @@ TEST_F(DefuserTest, NestedFusionInstructions) { EXPECT_THAT(computation->root_instruction(), op::Fusion()); - EXPECT_EQ(2, FusionCount()); - EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie()); - EXPECT_EQ(0, FusionCount()); + EXPECT_EQ(2, FusionCount(m.get())); + EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie()); + EXPECT_EQ(0, FusionCount(m.get())); EXPECT_THAT(computation->root_instruction(), op::Negate(op::Add())); } diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index 4159aa281fa..d6371283221 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -108,6 +108,7 @@ class DfsHloVisitorBase { virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0; virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0; virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0; + virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0; virtual Status HandleCompare(HloInstructionPtr hlo) { return HandleElementwiseBinary(hlo); } diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h 
b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index 4cd10ab06cd..e57184f639f 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -203,6 +203,9 @@ class DfsHloVisitorWithDefaultBase Status HandleAfterAll(HloInstructionPtr token) override { return DefaultAction(token); } + Status HandleGetDimensionSize(HloInstructionPtr get_size) override { + return DefaultAction(get_size); + } // Invoked to inform the visitor that the traversal has completed, and that // the root was "root". diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 515267edd7c..f98c943669b 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -1815,8 +1815,6 @@ StatusOr ElementalIrEmitter::EmitElementalGather( // Clamp the gather index so that the gather region fits in the operand. // gather_dim_component_extended_inbound = // clamp(gather_dim_component_extended, 0, largest_valid_start_index); - - // TODO(b/111078873): This is implementation defined behavior. bool is_signed = ShapeUtil::ElementIsSigned(indices_shape); auto gather_dim_component_extended_inbound = EmitIntegralMin( index.GetConstantWithIndexType(largest_valid_start_index), diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 47c56e2f7fb..10b8c01ff13 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -17,7 +17,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_format.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/status_macros.h" diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 3a6780f2a67..45f620f3f33 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -22,7 +22,7 @@ limitations under the License. #include "absl/types/span.h" #include "absl/types/variant.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc index 5fbd73a5363..8eeb930b481 100644 --- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc +++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -30,7 +30,7 @@ limitations under the License. 
namespace xla { namespace { -class FlattenCallGraphTest : public HloVerifiedTestBase { +class FlattenCallGraphTest : public HloTestBase { protected: // Build and return a trivial computation taking and returning a scalar. std::unique_ptr MakeScalarComputation() { @@ -108,7 +108,7 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) { // c // // Calls are made via kCall, kWhile, and kMap instructions. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* cond_computation = module->AddEmbeddedComputation(MakeConditionComputation()); HloComputation* c_computation = @@ -139,9 +139,9 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) { } { - TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get())); EXPECT_TRUE(result); - std::unique_ptr flat_call_graph = CallGraph::Build(module); + std::unique_ptr flat_call_graph = CallGraph::Build(module.get()); const CallGraphNode& c_node = flat_call_graph->GetNode(c_computation); EXPECT_EQ(1, c_node.caller_callsites().size()); } @@ -149,7 +149,7 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) { // Test corner case of a computation used as a body and a loop condition. TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* cond_computation; { HloComputation::Builder builder(TestName() + ".cond"); @@ -176,15 +176,15 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) { } { - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); const CallGraphNode& cond_node = call_graph->GetNode(cond_computation); EXPECT_EQ(2, cond_node.caller_callsites().size()); } { - TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get())); EXPECT_TRUE(result); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); const CallGraphNode& cond_node = call_graph->GetNode(cond_computation); EXPECT_EQ(1, cond_node.caller_callsites().size()); } @@ -201,7 +201,7 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) { // C // TEST_F(FlattenCallGraphTest, FlattenCalls) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* c_computation = module->AddEmbeddedComputation(MakeScalarComputation()); @@ -211,9 +211,9 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) { module->AddEntryComputation( MakeCallingComputation(b_computation, /*callsites=*/2, ".Entry")); - TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get())); EXPECT_TRUE(result); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); EXPECT_EQ(7, module->computation_count()); const CallGraphNode& c_node = call_graph->GetNode(c_computation); @@ -224,7 +224,7 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) { } TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* sub_computation = module->AddEmbeddedComputation(MakeScalarComputation()); @@ -243,9 +243,9 @@ TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) { module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, module->computation_count()); - TF_ASSERT_OK_AND_ASSIGN(bool result, 
RunFlattenCallGraph(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get())); EXPECT_TRUE(result); - std::unique_ptr call_graph = CallGraph::Build(module); + std::unique_ptr call_graph = CallGraph::Build(module.get()); // The true and false computations must now be different. EXPECT_EQ(3, module->computation_count()); diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 1e8435fe542..b1629616acd 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -111,7 +111,6 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -463,7 +462,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/service:shape_inference", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:test", ], @@ -627,7 +626,7 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # build_cleaner: keep ], ) @@ -849,7 +848,6 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "@com_google_absl//absl/memory", @@ -909,7 +907,6 @@ tf_cc_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", @@ -1036,6 +1033,6 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:pattern_matcher", - "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc index fa3afa6a5d3..af9303a5b76 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -29,7 +29,7 @@ namespace { namespace op = xla::testing::opcode_matchers; using ::testing::_; -class CudnnConvPadForTensorCoresTest : public HloVerifiedTestBase {}; +class CudnnConvPadForTensorCoresTest : public HloTestBase {}; TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvInputChannels) { auto module = ParseAndReturnVerifiedModule(R"( diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index c46672c598b..4ce877f62a5 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -254,7 +254,7 @@ MatchBackwardInput(HloInstruction* conv) { const auto no_match_result = std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr); - // TODO(b/31709653): Theoretically cuDNN supports grouped convolutions also + // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also // for the backward input convolution, but at least for now with version 7.1.4 // it is slower. This needs to be re-evaluated for future cuDNN versions. // Note that we already have the necessary code down below, the only thing to diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc index 87a835f2504..443883a89f6 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc @@ -24,7 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -34,11 +34,11 @@ namespace { namespace op = xla::testing::opcode_matchers; using ::testing::_; -class CudnnConvRewriterTest : public HloVerifiedTestBase { +class CudnnConvRewriterTest : public HloTestBase { public: CudnnConvRewriterTest() - : HloVerifiedTestBase(/*layout_sensitive=*/true, - /*allow_mixed_precision=*/false) { + : HloTestBase(/*layout_sensitive=*/true, + /*allow_mixed_precision=*/false) { for (int i = 0; i < 2; ++i) { WindowDimension* window_dim = default_conv_window_.add_dimensions(); window_dim->set_size(1); @@ -118,10 +118,10 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolve) { metadata.set_op_name("foo"); conv->set_metadata(metadata); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); ASSERT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -152,10 +152,10 @@ TEST_F(CudnnConvRewriterTest, activations, gradients, /*feature_group_count=*/1, conv_window, tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -182,10 +182,10 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithPaddedActivations) { /*feature_group_count=*/1, conv_window, tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -212,10 +212,10 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithPaddedGradients) { /*feature_group_count=*/1, conv_window, tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -241,10 +241,10 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithUnevenPadding) { /*feature_group_count=*/1, conv_window, tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( 
op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -292,10 +292,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveEvenPadding) { /*feature_group_count=*/1, conv_window, conv_dnums) .ValueOrDie())); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); ASSERT_THAT(entry_computation->root_instruction(), op::GetTupleElement( @@ -338,10 +338,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolve1x1Filter) { /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1, conv_window, tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); @@ -371,10 +371,10 @@ TEST_F(CudnnConvRewriterTest, default_conv_window_, tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); EXPECT_THAT( entry_computation->root_instruction(), op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); @@ -425,10 +425,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveUnevenPaddingOnGradients) { conv_window, tf_default_dnums_for_backward_input_) .ValueOrDie())); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); ASSERT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); @@ -475,10 +475,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveLowPaddingTooLarge) { conv_window, tf_default_dnums_for_backward_input_) .ValueOrDie())); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); EXPECT_THAT( entry_computation->root_instruction(), op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); @@ -529,10 +529,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveUnevenPaddingOnActivations) { conv_window, tf_default_dnums_for_backward_input_) .ValueOrDie())); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); const HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); ASSERT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); @@ -584,10 +584,10 @@ TEST_F(CudnnConvRewriterTest, conv_window, tf_default_dnums_for_backward_input_) .ValueOrDie())); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module)); + EXPECT_TRUE(RunPass(module.get())); EXPECT_THAT( 
entry_computation->root_instruction(), op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); @@ -600,7 +600,8 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveConstantFilter) { constant_arr.FillIota(0); string constant_str = LiteralUtil::CreateR4FromArray4D(constant_arr).ToString(); - ParseAndVerifyModule(absl::StrFormat(R"( + + const string module_str = absl::StrFormat(R"( HloModule test ENTRY entry_computation { @@ -610,10 +611,12 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveConstantFilter) { window={size=4x4 pad=2_2x2_2 lhs_dilate=2x2}, dim_labels=bf01_01oi->bf01, feature_group_count=1 })", - constant_str)); - EXPECT_TRUE(RunPass(&module())); + constant_str); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str)); + + EXPECT_TRUE(RunPass(m.get())); EXPECT_THAT( - module().entry_computation()->root_instruction(), + m->entry_computation()->root_instruction(), op::GetTupleElement(op::CustomCall(kCudnnConvBackwardInputCallTarget, _, op::Reverse(op::Constant())), 0)); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc index 02a0d028c11..91609c730b6 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc @@ -42,7 +42,7 @@ class GpuHloOrdering : public PredecessorHloOrdering { // Only the entry computation can possibly be sequentially ordered, and only // if we've assigned all instructions to a single stream. - const std::vector* SequentialOrder( + const HloInstructionSequence* SequentialOrder( const HloComputation& computation) const override { return &computation == module_->entry_computation() ? entry_sequence_.get() : nullptr; @@ -51,7 +51,7 @@ class GpuHloOrdering : public PredecessorHloOrdering { string ToString() const override { return ToStringHelper("GpuHloOrdering"); } private: - std::unique_ptr> entry_sequence_; + std::unique_ptr entry_sequence_; }; GpuHloOrdering::GpuHloOrdering( @@ -60,8 +60,8 @@ GpuHloOrdering::GpuHloOrdering( : PredecessorHloOrdering(module) { // The entry computation has a total order when there's only one stream. if (stream_assignment.StreamCount() == 1) { - entry_sequence_ = absl::make_unique>( - thunk_launch_order); + entry_sequence_ = + absl::make_unique(thunk_launch_order); } // The ordering of instructions for the entry computation is determined by the @@ -124,7 +124,8 @@ GpuHloOrdering::GpuHloOrdering( for (auto* computation : module->computations()) { if (computation != module->entry_computation() && !computation->IsFusionComputation()) { - predecessors_.emplace(computation, computation->ComputeReachability()); + predecessors_.emplace(computation, + HloReachabilityMap::Build(computation)); } } } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc index b857fa775a7..6d3aed15ebe 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc @@ -24,14 +24,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/types.h" namespace xla { namespace gpu { -class GpuHloScheduleTest : public HloVerifiedTestBase { +class GpuHloScheduleTest : public HloTestBase { protected: using HloVec = std::vector; @@ -44,7 +44,7 @@ class GpuHloScheduleTest : public HloVerifiedTestBase { .ConsumeValueOrDie(); } - std::unique_ptr CreateNewModule() { + std::unique_ptr CreateNewUnverifiedModule() { HloModuleConfig config; auto debug_options = GetDebugOptionsForTest(); debug_options.set_xla_gpu_disable_multi_streaming(false); @@ -79,7 +79,7 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) { HloInstruction* dot2 = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build(dot2)); std::unique_ptr streams = AssignStreams(*module); @@ -139,7 +139,7 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) { HloInstruction* add3 = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build(add3)); std::unique_ptr streams = AssignStreams(*module); @@ -209,7 +209,7 @@ TEST_F(GpuHloScheduleTest, ConcurrentMatMul) { HloInstruction* add = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, dot2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build(add)); std::unique_ptr streams = AssignStreams(*module); @@ -288,7 +288,7 @@ TEST_F(GpuHloScheduleTest, LatticeMatMul) { HloInstruction* d40 = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build(d40)); std::unique_ptr streams = AssignStreams(*module); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc index 7d01eeb0256..b511155f85f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -25,7 +25,7 @@ namespace { using ::testing::HasSubstr; -class GpuHloSupportCheckerTest : public HloVerifiedTestBase { +class GpuHloSupportCheckerTest : public HloTestBase { protected: GpuHloSupportChecker& checker() { return checker_; } @@ -42,10 +42,10 @@ TEST_F(GpuHloSupportCheckerTest, Add) { HloInstruction::CreateParameter(1, scalar_shape, "param1")); builder.AddInstruction(HloInstruction::CreateBinary( scalar_shape, HloOpcode::kAdd, param0, param1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - TF_ASSERT_OK(checker().Run(module).status()); + TF_ASSERT_OK(checker().Run(module.get()).status()); } TEST_F(GpuHloSupportCheckerTest, SparseUnimplemented) { @@ -60,7 +60,7 @@ TEST_F(GpuHloSupportCheckerTest, SparseUnimplemented) { // Since verifier is reporting sparse layouts as errors, we should // use a regular HloModule instead of VerifiedHloModule to avoid // verifier errors being triggered in the destructor. - auto module = HloTestBase::CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); Status status = checker().Run(module.get()).status(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc index 4822b820f4e..8cc76c872c6 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc @@ -61,7 +61,7 @@ TEST_F(LayoutAssignmentTest, Elementwise) { HloInstruction::CreateParameter(1, ashape, "y")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, x, y)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build(add)); @@ -148,7 +148,7 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) { {operand, scale, offset, mean, variance, epsilon, feature_index}, kCudnnBatchNormForwardInferenceCallTarget)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build(batchnorm)); @@ -217,7 +217,7 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) { batchnorm_shape, {operand, scale, offset, epsilon, feature_index}, kCudnnBatchNormForwardTrainingCallTarget)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build(batchnorm)); @@ -298,7 +298,7 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) { feature_index}, kCudnnBatchNormBackwardCallTarget)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build(batchnorm)); diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 7f2b59810f0..43f43b50e4a 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ 
b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -47,6 +47,7 @@ bool IsFusible(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kReduce || hlo.opcode() == HloOpcode::kReduceWindow || hlo.opcode() == HloOpcode::kReshape || + hlo.opcode() == HloOpcode::kReverse || hlo.opcode() == HloOpcode::kScatter || hlo.opcode() == HloOpcode::kSlice || hlo.opcode() == HloOpcode::kTranspose; diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index 57e66f5a12c..fb77bc4b8eb 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -41,7 +41,7 @@ TEST_F(InstructionFusionTest, builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(S32, {1}), exp1, {0})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(broadcast2, computation->root_instruction()); EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -61,7 +61,7 @@ TEST_F(InstructionFusionTest, builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(S32, {1}), negate1, {0})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(broadcast2, computation->root_instruction()); EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -80,7 +80,7 @@ TEST_F(InstructionFusionTest, HloInstruction* reshape2 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), exp1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape2, computation->root_instruction()); EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -99,7 +99,7 @@ TEST_F(InstructionFusionTest, HloInstruction* transpose2 = builder.AddInstruction( HloInstruction::CreateTranspose(ShapeUtil::MakeShape(S32, {}), exp1, {})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(transpose2, computation->root_instruction()); EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfDotUnfused) { auto reshape2 = builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(S32, {1, 1, 1}), dot1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape2, computation->root_instruction()); EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -134,7 +134,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) { auto transpose2 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {1, 1}), dot1, {0, 1})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(transpose2, computation->root_instruction()); EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -723,7 +723,7 @@ TEST_F(InstructionFusionTest, AvoidsLargeFusion) { sum = b.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sum, param)); } - auto module = CreateNewModule(); + auto module = 
CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(b.Build()); EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) .Run(module.get()) @@ -805,5 +805,26 @@ TEST_F(InstructionFusionTest, NonscalarConstantsNotFused) { op::Reduce(op::Broadcast(op::Parameter()), op::Constant())); } +TEST_F(InstructionFusionTest, FuseReverse) { + auto module = ParseHloString(R"( + HloModule test_module + + ENTRY Reverse { + p0 = f32[50,96,1024]{2,1,0} parameter(0) + add = f32[50,96,1024]{2,1,0} add(p0, p0) + ROOT reverse = f32[50,96,1024] reverse(add), dimensions={0} + })") + .ValueOrDie(); + + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Fusion()); + EXPECT_THAT(root->fused_expression_root(), + op::Reverse(op::Add(op::Parameter(), op::Parameter()))); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 21e44e1e7d3..87b6cd640ac 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2197,9 +2197,10 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { } int64 dimension_to_sort = sort->dimensions(0); - int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); + uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); - auto index_type = b_.getInt64Ty(); + CHECK_GE(1ULL << num_stages, dimension_to_sort_bound); + CHECK_LT(1ULL << (num_stages - 1), dimension_to_sort_bound); // Naive C++ code for the outer loops: // @@ -2213,42 +2214,119 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { // } // } // - // This follows the algorithm described on Wikipedia: - // https://en.wikipedia.org/wiki/Bitonic_sorter + // This follows the alternative representation of the algorithm described on + // Wikipedia: https://en.wikipedia.org/wiki/Bitonic_sorter + // + // Each mask specifies how to derive from one position in the array the + // position with which it should be compared (we calculate the xor of the + // position with the mask). + // As an optimization, we can move the 'mask' loop to inside the + // sorting/comparison loop if the comparisons happen within a small block of + // the array. To make this work, we collect all consecutive masks that are + // smaller than our chosen power of 2 tile size, and pass them to SortInPlace. + // Each thread then processes one tile of data. + const uint64 kTileSize = std::min(2048ULL, 1ULL << num_stages); + + // If we cannot combine several xor masks together, we don't use tiling, so we + // calculate the standard launch dimensions for the shape. However we only + // need to iterate through ~half of the dimension to sort (rounded up to the + // next highest power of 2), because each iteration compares one pair of + // elements. + Shape standard_iteration_shape = keys_shape; + uint64 standard_num_iterations_in_sort_dim = 1ULL << (num_stages - 1); + standard_iteration_shape.set_dimensions(dimension_to_sort, + standard_num_iterations_in_sort_dim); + LaunchDimensions standard_launch_dimensions = CalculateLaunchDimensions( + standard_iteration_shape, ir_emitter_context_->device_description()); + + // Calculate the launch dimensions for the case where we use tiling. 
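The comments above describe how the new sort emitter batches consecutive xor masks that stay inside one tile, so that a single kernel launch can process each tile in shared memory with kTileSize / 2 threads per block. A standalone host-side sketch of that grouping logic follows; num_stages and kTileSize are small illustrative values, and EmitKernel stands in for the emit_kernel lambda in the real code:

#include <cstdint>
#include <iostream>
#include <vector>

// Prints the xor-mask group that would be handed to one kernel launch.
void EmitKernel(const std::vector<int64_t>& xor_masks) {
  std::cout << "kernel:";
  for (int64_t m : xor_masks) std::cout << " " << m;
  std::cout << "\n";
}

int main() {
  const int64_t num_stages = 4;  // sort-dimension bound in (8, 16]
  const int64_t kTileSize = 4;   // tiny tile, just to show the grouping
  std::vector<int64_t> xor_masks;
  for (int64_t stage = 0; stage < num_stages; ++stage) {
    for (int64_t mask = stage; mask >= 0; --mask) {
      // The first mask of a stage compares within a bitonic block of size
      // 2^(stage+1); the remaining masks are single-bit butterfly steps.
      int64_t xor_mask =
          (mask == stage) ? (1LL << (stage + 1)) - 1 : (1LL << mask);
      if (xor_mask >= kTileSize) {
        // Comparisons cross tile boundaries: flush any pending small masks,
        // then emit this mask as its own untiled kernel.
        if (!xor_masks.empty()) {
          EmitKernel(xor_masks);
          xor_masks.clear();
        }
        EmitKernel({xor_mask});
      } else {
        // Comparisons stay inside one tile; batch them into a single launch
        // that keeps the tile resident in shared memory.
        xor_masks.push_back(xor_mask);
      }
    }
  }
  if (!xor_masks.empty()) EmitKernel(xor_masks);
  // Prints: kernel: 1 3 1 / kernel: 7 / kernel: 2 1 / kernel: 15 /
  //         kernel: 4 / kernel: 2 1
}

The real emitter additionally falls back to the untiled path (no_tiling) when the tile is smaller than 128, when kTileSize / 2 exceeds the device's threads-per-block limit, or when the per-operand tiles would not fit in shared memory.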
We split + // the dimension that should be sorted into tiles of size 'kTileSize'. This + // means we first need to round 'dimension_to_sort_bound' up to be a multiple + // of the tile size. + int64 rounded_bound = RoundUpToNearest(dimension_to_sort_bound, kTileSize); + Shape iteration_shape = keys_shape; + + // We iterate through the element pairs that should be compared. + uint64 num_iterations_in_sort_dim = rounded_bound / 2; + iteration_shape.set_dimensions(dimension_to_sort, num_iterations_in_sort_dim); + uint64 num_iterations = ShapeUtil::ElementsIn(iteration_shape); + + // For correctness reasons we need exactly 'kTileSize' / 2 many threads per + // block. Each thread is responsible for copying exactly two adjacent elements + // into shared memory, and then does a comparison of two possibly different + // elements taken from shared memory. + const uint64 kThreadsPerBlock = kTileSize / 2; + + // Check whether we should use any tiling. We might not be able to use it if + // we have not enough threads, or not enough shared memory. Also it does not + // give a speedup if the tile size is < 128. + int64 total_shared_memory_needed = 0; + for (int64 i = 0; i < sort->operand_count(); ++i) { + total_shared_memory_needed += + kTileSize * ShapeUtil::ByteSizeOfPrimitiveType( + sort->operand(i)->shape().element_type()); + } + bool no_tiling = + kTileSize < 128 || + kThreadsPerBlock > + ir_emitter_context_->device_description().threads_per_block_limit() || + total_shared_memory_needed > + ir_emitter_context_->device_description().shared_memory_per_block(); + + uint64 num_blocks = CeilOfRatio(num_iterations, kThreadsPerBlock); + LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock); + + auto emit_kernel = [&](absl::Span xor_masks) { + thunks.push_back( + BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); + LaunchDimensions launch_dimensions = xor_masks.size() > 1 + ? tiled_launch_dimensions + : standard_launch_dimensions; + UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), + ir_emitter_context_->llvm_module()); + IrArray keys_array; + std::vector values_arrays; + values_arrays.reserve(sort->operand_count() - 1); + for (int64 i = 0; i < sort->operand_count(); ++i) { + ShapeIndex shape_index = + sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({}); + if (i == 0) { + keys_array = GetIrArray(*sort, *sort, shape_index); + } else { + values_arrays.push_back(GetIrArray(*sort, *sort, shape_index)); + } + } + return llvm_ir::EmitSortInPlace( + dimension_to_sort, keys_array, values_arrays, IrName(sort), xor_masks, + &b_, launch_dimensions, + xor_masks.size() > 1 ? 
num_iterations_in_sort_dim + : standard_num_iterations_in_sort_dim, + kTileSize); + }; + std::vector xor_masks; for (int64 stage = 0; stage < num_stages; ++stage) { for (int64 mask = stage; mask >= 0; --mask) { - thunks.push_back( - BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - keys_shape, ir_emitter_context_->device_description()); - UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), - ir_emitter_context_->llvm_module()); - - llvm::Value* xor_mask; + int64 xor_mask; if (mask == stage) { - xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1); + xor_mask = (1LL << (stage + 1)) - 1; } else { - xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask); + xor_mask = 1LL << mask; } - - IrArray keys_array; - std::vector values_arrays; - values_arrays.reserve(sort->operand_count() - 1); - for (int64 i = 0; i < sort->operand_count(); ++i) { - ShapeIndex shape_index = - sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({}); - if (i == 0) { - keys_array = GetIrArray(*sort, *sort, shape_index); - } else { - values_arrays.push_back(GetIrArray(*sort, *sort, shape_index)); + if (xor_mask >= kTileSize || no_tiling) { + if (!xor_masks.empty()) { + TF_RETURN_IF_ERROR(emit_kernel(xor_masks)); + xor_masks.clear(); } + TF_RETURN_IF_ERROR(emit_kernel({xor_mask})); + } else { + xor_masks.push_back(xor_mask); } - TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace( - dimension_to_sort, keys_array, values_arrays, IrName(sort), xor_mask, - &b_, &launch_dimensions)); } } + if (!xor_masks.empty()) { + TF_RETURN_IF_ERROR(emit_kernel(xor_masks)); + } AddThunkToThunkSequence( absl::make_unique(std::move(thunks), sort)); @@ -3261,13 +3339,9 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( param->shape().element_type(), module_), kTileSize + 1), kTileSize); - const int kNVPTXSharedMemoryAddrSpace = 3; - auto* tile_base_ptr = new llvm::GlobalVariable( - *b_.GetInsertBlock()->getParent()->getParent(), tile_type, - /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage, - llvm::UndefValue::get(tile_type), - llvm_ir::AsStringRef(IrName(hlo, StrCat("tile", id))), nullptr, - llvm::GlobalValue::NotThreadLocal, kNVPTXSharedMemoryAddrSpace); + auto* tile_base_ptr = llvm_ir::AllocateSharedMemoryTile( + b_.GetInsertBlock()->getParent()->getParent(), tile_type, + IrName(hlo, StrCat("tile", id))); param_shmem_buffers[id] = tile_base_ptr; VLOG(3) << "Added shmem buffer for parameter " << id << ": " << llvm_ir::DumpToString(*tile_base_ptr); @@ -3454,6 +3528,29 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( return launch_dimensions; } +namespace { +// Returns true to indicate it is safe to use the tile based shared memory +// transpose implementation to implement the kernel for the instruction. +// +// An instruction is not safe for such an implementation if it can change the +// element order of a tensor without changing the dimension of the tensor, and +// the instruction has a corresponding elemental_ir_emitter. 
+bool IsInstructionSafeForTileBasedTranspose(const HloInstruction* hlo) { + auto is_safe_for_tile_based_transpose = [&](const HloInstruction* instr) { + HloOpcode opcode = instr->opcode(); + CHECK_NE(opcode, HloOpcode::kFusion); + return (opcode != HloOpcode::kReverse && opcode != HloOpcode::kGather); + }; + + if (hlo->opcode() == HloOpcode::kFusion) { + return absl::c_all_of(hlo->fused_instructions_computation()->instructions(), + is_safe_for_tile_based_transpose); + } + + return is_safe_for_tile_based_transpose(hlo); +} +} // namespace + bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { HloOpcode opcode = hlo->opcode(); CHECK(opcode == HloOpcode::kFusion || opcode == HloOpcode::kCopy); @@ -3498,6 +3595,10 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { return false; } + if (!IsInstructionSafeForTileBasedTranspose(hlo)) { + return false; + } + // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the // elements are of size 4 bytes), and CUDA has an architectural limit of 48kb // shared memory per SM. (This is increased to 96kb in Volta, but we don't diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc index 9427d3d54ad..d9b06828e2b 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc @@ -140,6 +140,18 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1, return false; } + // The emitter only supports in-place DUS for fusions with a single DUS at the + // root. Don't sibling fuse DUS for now. + // TODO(b/119178699): Multi-output fusing DUS can improve performance if we + // share the input and output buffers and add support to the emitter. + if (instr1->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice || + (instr2->opcode() == HloOpcode::kFusion && + instr2->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice)) { + return false; + } + // Do this check last, as it may be expensive. return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2); } diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc index 1d4856e0cae..dc221f22a74 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc @@ -580,7 +580,7 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) { // ... // where each of the (pi * pj)'s is represented as a fusion node so that // multi-output fusion will pay attention to it. 
- auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder b(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {10, 100}); @@ -621,5 +621,39 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) { } } +TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) { + auto module = ParseHloString(R"(HloModule dus_mof + fusion.1 { + p.0 = f16[50,96,1024]{2,1,0} parameter(0) + p.1 = s32[1]{0} parameter(1) + p.2 = f16[1,96,1024]{2,1,0} parameter(2) + c.0 = s32[] constant(0) + pad = s32[3]{0} pad(p.1, c.0), padding=0_2 + ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad) + } + + fusion.2 { + p.0 = f16[50,96,1024]{2,1,0} parameter(0) + p.1 = s32[1]{0} parameter(1) + p.2 = f16[1,96,1024]{2,1,0} parameter(2) + c.0 = s32[] constant(0) + pad = s32[3]{0} pad(p.1, c.0), padding=0_2 + ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad) + } + + ENTRY entry { + p.00 = f16[50,96,1024]{2,1,0} parameter(0) + p.01 = f16[50,96,1024]{2,1,0} parameter(1) + p.1 = s32[1]{0} parameter(2) + p.2 = f16[1,96,1024]{2,1,0} parameter(3) + + f1 = f16[50,96,1024] fusion(p.00, p.1, p.2), kind=kLoop, calls=fusion.1 + f2 = f16[50,96,1024] fusion(p.01, p.1, p.2), kind=kLoop, calls=fusion.2 + ROOT tuple = (f16[50,96,1024],f16[50,96,1024]) tuple(f1, f2) + })") + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc index 5b6cf2c04d0..4775baf44ae 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc @@ -122,7 +122,7 @@ std::unique_ptr AssignStreams(const HloModule& module) { auto stream_assignment = absl::make_unique(); const HloComputation& computation = *module.entry_computation(); std::unique_ptr reachability = - computation.ComputeReachability(); + HloReachabilityMap::Build(&computation); std::vector seen_gemms; // The execution of different RNG Hlo instructions in the same module updates // a common global variable. To avoid a race condition, we simply assign all diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc index c4f43cc9a61..f2ef11e1e6a 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc @@ -21,16 +21,16 @@ limitations under the License. 
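Both stream_assignment.cc here and gpu_hlo_schedule.cc above pick up the same API move: reachability is no longer produced by the member function HloComputation::ComputeReachability() (deleted further down in hlo_computation.cc) but by the factory HloReachabilityMap::Build(), which takes the computation by pointer. A minimal before/after sketch, using only what is visible in this change; producer and consumer stand for arbitrary HloInstruction pointers:

// Before: reachability came from the computation itself.
//   std::unique_ptr<HloReachabilityMap> reachability =
//       computation.ComputeReachability();
// After: build it through the HloReachabilityMap factory instead.
std::unique_ptr<HloReachabilityMap> reachability =
    HloReachabilityMap::Build(&computation);
if (reachability->IsReachable(producer, consumer)) {
  // `consumer` transitively depends on `producer` via data or control edges,
  // so the two cannot safely run concurrently on separate streams.
}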
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/types.h" namespace xla { namespace gpu { -class StreamAssignmentTest : public HloVerifiedTestBase { +class StreamAssignmentTest : public HloTestBase { protected: - std::unique_ptr CreateNewModule() { + std::unique_ptr CreateNewUnverifiedModule() { HloModuleConfig config; auto debug_options = GetDebugOptionsForTest(); debug_options.set_xla_gpu_disable_multi_streaming(false); @@ -55,7 +55,7 @@ TEST_F(StreamAssignmentTest, SequentialMatMul) { HloInstruction* dot2 = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build(dot2)); std::unique_ptr assignment = AssignStreams(*module); @@ -76,7 +76,7 @@ TEST_F(StreamAssignmentTest, ConcurrentMatMul) { HloInstruction* add = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build(add)); std::unique_ptr assignment = AssignStreams(*module); @@ -120,7 +120,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) { HloInstruction* d40 = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build(d40)); std::unique_ptr assignment = AssignStreams(*module); diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index ed46f08d597..d798b316437 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -37,7 +37,7 @@ cc_library( hdrs = ["gpu_codegen_test.h"], tags = tf_cuda_tests_tags(), deps = [ - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla/service:gpu_plugin", "//tensorflow/compiler/xla/service/gpu:gpu_executable", "//tensorflow/compiler/xla/tests:filecheck", diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc index 79e77d4c4d6..9e3ff8750b8 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" #include "absl/memory/memory.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/tests/filecheck.h" #include "tensorflow/core/platform/logging.h" @@ -23,9 +23,10 @@ limitations under the License. 
namespace xla { namespace gpu { -std::unique_ptr GpuCodegenTest::CreateNewModuleWithFTZ(bool ftz) { +std::unique_ptr GpuCodegenTest::CreateNewUnverifiedModuleWithFTZ( + bool ftz) { HloModuleConfig config; - auto debug_options = legacy_flags::GetDebugOptionsFromFlags(); + auto debug_options = GetDebugOptionsFromFlags(); debug_options.set_xla_gpu_ftz(ftz); debug_options.set_xla_gpu_max_kernel_unroll_factor(1); // TODO(b/38354253): Change tests to use Parameters instead of Constants. diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h index e4a3573babb..d2f30ae7bc4 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h @@ -26,9 +26,9 @@ namespace gpu { // Tests that verify IR or PTX emitted by the GPU backend is as expected. class GpuCodegenTest : public LlvmIrGenTestBase { protected: - // Like HloTestBase::CreateNewModule(), with a flag for configuring the ftz - // option. - std::unique_ptr CreateNewModuleWithFTZ(bool ftz); + // Like HloTestBase::CreateNewUnverifiedModule(), with a flag for configuring + // the ftz option. + std::unique_ptr CreateNewUnverifiedModuleWithFTZ(bool ftz); // Compiles the given HLO module to PTX and verifies the PTX matches the given // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html). diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc index 780539c1642..268b48a1cad 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc @@ -46,7 +46,7 @@ TEST_F(GpuCopyTest, UseMemcpy) { std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); // There should not be any kernel prefixed "copy". 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc index 177b94934c7..d0ccd8619bd 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc @@ -39,7 +39,7 @@ class GpuFtzTest : public GpuCodegenTest { /* parameter_number=*/1, param_shape, "y")); builder.AddInstruction(HloInstruction::CreateBinary(param_shape, op, x, y)); - auto hlo_module = CreateNewModuleWithFTZ(ftz_); + auto hlo_module = CreateNewUnverifiedModuleWithFTZ(ftz_); hlo_module->AddEntryComputation(builder.Build()); return hlo_module; } @@ -54,7 +54,7 @@ class GpuFtzTest : public GpuCodegenTest { /* parameter_number=*/0, param_shape, "x")); builder.AddInstruction(HloInstruction::CreateUnary(param_shape, op, x)); - auto hlo_module = CreateNewModuleWithFTZ(ftz_); + auto hlo_module = CreateNewUnverifiedModuleWithFTZ(ftz_); hlo_module->AddEntryComputation(builder.Build()); return hlo_module; } diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc index a06576df7b8..da8e513a2c3 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc @@ -51,7 +51,7 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndex) { builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y)); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); // Check the optimized IR as the unoptimized IR contains dead udiv and urem. diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc index 15d1e269cc2..a302b582ede 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc @@ -193,6 +193,33 @@ TEST_F(GpuKernelTilingTest, /*match_optimized_ir=*/true); } +TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) { + const char *const kHloString = R"( + HloModule FusionTransposeWithReverseNotTiled + fused_computation.1 { + arg0 = f32[128,64]{1,0} parameter(0) + copy0 = f32[128,64]{0,1} copy(arg0) + ROOT reverse0 = f32[128,64]{0,1} reverse(copy0), dimensions={0} + } + + ENTRY reverse_break_assumption { + param0 = f32[128,64]{1,0} parameter(0) + ROOT fusion0 = f32[128,64]{0,1} fusion(param0), kind=kLoop, + calls=fused_computation.1 + })"; + + // Check that a call to llvm.nvvm.barrier0 is not generated. 
+ auto hlo_module = + ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie(); + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: define void @fusion +; CHECK-NOT: tail call void @llvm.nvvm.barrier0() +; CHECK: } +)", + /*match_optimized_ir=*/true); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc index 6a9ecd9dae7..ea1fee040dd 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc @@ -48,7 +48,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) { HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param)); std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); CompileAndVerifyPtx(std::move(hlo_module), R"( @@ -73,7 +73,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) { builder.AddInstruction(HloInstruction::CreateTuple({add, square})); std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); CompileAndVerifyPtx(std::move(hlo_module), R"( @@ -95,7 +95,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) { // reduce in the foreseeable future. But if that turns out to be wrong, I give // you, future reader, permission to delete this test. TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) { - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); HloComputation* reduce_computation; diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc index 15198865bda..14285459b5a 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc @@ -47,7 +47,7 @@ TEST_F(GpuNoAliasTest, Concat) { std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); CompileAndVerifyIr(std::move(hlo_module), diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc index 0f2d5568caf..4636f1d9d20 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc @@ -85,7 +85,7 @@ TEST_F(GpuUnrollingTest, UnrollFourTimes) { TEST_F(GpuUnrollingTest, UnrollDefaultTimes) { // The default unrolling factor is 4. HloModuleConfig config; - config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + config.set_debug_options(GetDebugOptionsFromFlags()); auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie(); CompileAndVerifyIr(std::move(hlo_module), diff --git a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc index 5fa9e91050a..3d00ac4dc7b 100644 --- a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc @@ -23,7 +23,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -32,7 +32,7 @@ namespace gpu { namespace { using match::Concatenate; -class VariadicOpSplitterTest : public HloVerifiedTestBase {}; +class VariadicOpSplitterTest : public HloTestBase {}; TEST_F(VariadicOpSplitterTest, DontSplit) { auto module = ParseAndReturnVerifiedModule(R"( diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc index 926b59a1b85..c7f51127649 100644 --- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc +++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc @@ -29,7 +29,7 @@ namespace { class WhileTransformerTest : public HloTestBase { protected: WhileTransformerTest() - : module_(CreateNewModule()), + : module_(CreateNewUnverifiedModule()), induction_variable_shape_(ShapeUtil::MakeShape(S32, {})), data_shape_(ShapeUtil::MakeShape(F32, {8})), condition_result_shape_(ShapeUtil::MakeShape(PRED, {})) {} diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index e30e7667f30..fad3215fc81 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -30,16 +30,16 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_value.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { -class MinimumMemoryForSequenceTest : public HloVerifiedTestBase {}; +class MinimumMemoryForSequenceTest : public HloTestBase {}; TEST_F(MinimumMemoryForSequenceTest, MultiComputation) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape}); @@ -86,7 +86,7 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); }; - HloSchedule schedule(module); + HloSchedule schedule(module.get()); schedule.set_sequence(cond_computation, {cond_param, cond_iter, cond_data, cond_lt}); schedule.set_sequence(body_computation, {body_param}); @@ -351,7 +351,7 @@ class HeapSimulatorTracker { HeapSimulator::Result result_; }; -class HeapSimulatorTest : public HloVerifiedTestBase { +class HeapSimulatorTest : public HloTestBase { protected: HeapSimulatorTest() {} ~HeapSimulatorTest() override {} diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc index 5c8d97b2d15..7e6150e9415 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc @@ -28,7 +28,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/logging.h" @@ -39,17 +39,17 @@ namespace { using ::testing::UnorderedElementsAre; -class HloAliasAnalysisTest : public HloVerifiedTestBase { +class HloAliasAnalysisTest : public HloTestBase { protected: - HloAliasAnalysisTest() : HloVerifiedTestBase() { - module_ = CreateNewModule(); + HloAliasAnalysisTest() : HloTestBase() { + module_ = CreateNewVerifiedModule(); } // Run alias analysis on the member module. For convenience returns a // reference to the generated analysis stored in analysis_. HloAliasAnalysis& RunAnalysis() { hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis"); - analysis_ = HloAliasAnalysis::Run(module_, + analysis_ = HloAliasAnalysis::Run(module_.get(), /*fusion_can_share_buffer=*/nullptr) .ConsumeValueOrDie(); return *analysis_; @@ -93,7 +93,7 @@ class HloAliasAnalysisTest : public HloVerifiedTestBase { // never occurs, but HLO graphs with interference can be explicitly // constructed. bool AnyValuesInSameBufferInterfere() { - DependencyHloOrdering ordering(module_); + DependencyHloOrdering ordering(module_.get()); for (const HloBuffer& buffer : analysis_->buffers()) { for (const HloValue* value_a : buffer.values()) { for (const HloValue* value_b : buffer.values()) { @@ -110,7 +110,7 @@ class HloAliasAnalysisTest : public HloVerifiedTestBase { return false; } - HloModule* module_; + std::unique_ptr module_; std::unique_ptr analysis_; const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); @@ -638,7 +638,7 @@ TEST_F(HloAliasAnalysisTest, SequentialWhiles) { module_->AddEntryComputation(builder.Build()); FlattenCallGraph flattener; - TF_ASSERT_OK(flattener.Run(module_).status()); + TF_ASSERT_OK(flattener.Run(module_.get()).status()); const HloAliasAnalysis& analysis = RunAnalysis(); @@ -1012,7 +1012,7 @@ TEST_F(HloAliasAnalysisTest, BitcastInterference) { const HloAliasAnalysis& analysis = RunAnalysis(); - DependencyHloOrdering ordering(module_); + DependencyHloOrdering ordering(module_.get()); EXPECT_FALSE(analysis.HasLiveRangeInterference(ordering)); } @@ -1054,13 +1054,13 @@ TEST_F(HloAliasAnalysisTest, WhileInterference) { { // Dependency ordering should interfere because the negate and while are // unordered. - DependencyHloOrdering ordering(module_); + DependencyHloOrdering ordering(module_.get()); EXPECT_TRUE(analysis.HasLiveRangeInterference(ordering)); } // For a sequential order, if there is interference iff the negate is after // the while. - HloSchedule schedule(module_); + HloSchedule schedule(module_.get()); schedule.set_sequence(body, {body_param, body_root}); schedule.set_sequence(condition, {cond_param, cond_root}); { diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index b0f7cd91ad1..0c20d207ddb 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -33,6 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -321,7 +322,7 @@ void HloComputation::ComputeInstructionPostOrder( // Add the operands to the stack in reverse order so the first operand is // processed first. This will produce a more natural ordering and a nicer - // result for thigns like HLO stringification. + // result for things like HLO stringification. const auto& operands = current->operands(); for (int64 i = operands.size() - 1; i >= 0; --i) { dfs_stack.emplace_back(operands[i]); @@ -739,72 +740,6 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction, return RemoveInstructionAndUnusedOperands(old_instruction); } -std::unique_ptr HloComputation::ComputeReachability() - const { - const auto& all = MakeInstructionPostOrder(); - auto result = absl::make_unique(all); - auto channel_dependency_map = ComputeChannelDependencies(); - - std::vector inputs; - for (const HloInstruction* hlo : all) { - inputs.assign(hlo->operands().begin(), hlo->operands().end()); - inputs.insert(inputs.end(), hlo->control_predecessors().begin(), - hlo->control_predecessors().end()); - - switch (hlo->opcode()) { - case HloOpcode::kRecvDone: { - auto it = channel_dependency_map.find(hlo->channel_id()); - if (it != channel_dependency_map.end()) { - absl::c_copy(it->second, std::back_inserter(inputs)); - } - break; - } - case HloOpcode::kCrossReplicaSum: { - auto all_reduce_id = hlo->all_reduce_id(); - if (all_reduce_id) { - auto it = channel_dependency_map.find(all_reduce_id.value()); - if (it != channel_dependency_map.end()) { - absl::c_copy(it->second, std::back_inserter(inputs)); - } - } - break; - } - default: - break; - } - - result->FastSetReachabilityToUnion(inputs, hlo); - } - return result; -} - -void HloComputation::UpdateReachabilityThroughInstruction( - const HloInstruction* instruction, HloReachabilityMap* reachability_map) { - std::queue worklist; - worklist.push(instruction); - - std::vector inputs; - - while (!worklist.empty()) { - const HloInstruction* item = worklist.front(); - worklist.pop(); - - inputs.assign(item->operands().begin(), item->operands().end()); - inputs.insert(inputs.end(), item->control_predecessors().begin(), - item->control_predecessors().end()); - - if (reachability_map->SetReachabilityToUnion(inputs, item)) { - // Add immediate successors to worklist. 
- for (const HloInstruction* user : item->users()) { - worklist.push(user); - } - for (const HloInstruction* succ : item->control_successors()) { - worklist.push(succ); - } - } - } -} - std::vector HloComputation::CollectUnreachableRoots() const { std::vector unreachable_roots; for (auto* instruction : instructions()) { @@ -911,14 +846,46 @@ std::unique_ptr HloComputation::Clone( return CloneWithReplacements( /*replacements=*/std::unordered_map>(), - /*extras=*/{}, context, suffix); + context, suffix); +} + +std::unique_ptr HloComputation::CloneWithReplacementPairs( + std::pair> r1, + HloCloneContext* context, const string& suffix) { + std::unordered_map> + replacements; + replacements.emplace(std::move(r1)); + return CloneWithReplacements(std::move(replacements), context, suffix); +} + +std::unique_ptr HloComputation::CloneWithReplacementPairs( + std::pair> r1, + std::pair> r2, + HloCloneContext* context, const string& suffix) { + std::unordered_map> + replacements; + replacements.emplace(std::move(r1)); + replacements.emplace(std::move(r2)); + return CloneWithReplacements(std::move(replacements), context, suffix); +} + +std::unique_ptr HloComputation::CloneWithReplacementPairs( + std::pair> r1, + std::pair> r2, + std::pair> r3, + HloCloneContext* context, const string& suffix) { + std::unordered_map> + replacements; + replacements.emplace(std::move(r1)); + replacements.emplace(std::move(r2)); + replacements.emplace(std::move(r3)); + return CloneWithReplacements(std::move(replacements), context, suffix); } std::unique_ptr HloComputation::CloneWithReplacements( std::unordered_map> replacements, - absl::Span extras, HloCloneContext* context, - const string& suffix) { + HloCloneContext* context, const string& suffix) { std::unique_ptr context_ptr; if (context == nullptr) { context_ptr = absl::make_unique(parent(), suffix); @@ -939,18 +906,50 @@ std::unique_ptr HloComputation::CloneWithReplacements( }; VLOG(1) << "Cloning " << name() << " --> " << suffix << "\n"; + + // We want to do a postorder walk over [replace(i) for i in instructions_]. + // We can't reuse MakeInstructionPostOrder() for this, because that will + // generate a postorder of plain instructions_, and our replacements may + // change the postorder! + // + // The postorder we want here is simpler than what MakeInstructionPostOrder() + // does -- we only care about operand dependencies -- so let's just do it + // ourselves. 
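The comment above explains why the clone walk cannot reuse MakeInstructionPostOrder(): a replacement may map an existing instruction to a brand-new one that lives only in the `replacements` map, so the postorder must be derived from the replaced operand graph rather than from the original instruction list. A hedged sketch of how the CloneWithReplacementPairs overloads (declared later in hlo_computation.h) are meant to be called; `computation` and `old_param` are placeholders for an existing computation and one of its instructions:

// The replacement constant exists only in the replacements map; it is cloned
// into the new computation as part of the replaced postorder walk.
std::unique_ptr<HloInstruction> zero =
    HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f));
std::unique_ptr<HloComputation> clone =
    computation->CloneWithReplacementPairs({old_param, std::move(zero)});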
std::vector postorder; - for (HloInstruction* instr : extras) { - postorder.push_back(instr); - } - for (HloInstruction* instr : MakeInstructionPostOrder()) { - if (HloInstruction* replacement = replace(instr)) { - postorder.push_back(replacement); + absl::flat_hash_map visited; + for (const auto& instr : instructions_) { + std::vector dfs_stack; + HloInstruction* new_instr = replace(instr.get()); + if (!new_instr) { + continue; + } + dfs_stack.push_back(new_instr); + + while (!dfs_stack.empty()) { + auto* cur = dfs_stack.back(); + auto it = visited.find(cur); + if (it != visited.end()) { + dfs_stack.pop_back(); + if (it->second == kVisited) { + continue; + } + CHECK_EQ(it->second, kVisiting); + postorder.push_back(cur); + it->second = kVisited; + continue; + } + + visited.insert({cur, kVisiting}); + for (HloInstruction* operand : cur->operands()) { + HloInstruction* new_operand = replace(operand); + if (new_operand) { + dfs_stack.emplace_back(new_operand); + } + } } } std::vector> instructions; - std::unique_ptr new_instr; for (auto instr : postorder) { std::vector new_operands; for (auto operand : instr->operands()) { @@ -960,9 +959,8 @@ std::unique_ptr HloComputation::CloneWithReplacements( << operand->ToString() << ", used by " << instr->ToString(); new_operands.push_back(context->GetInstruction(replaced_operand)); } - new_instr = - instr->CloneWithNewOperands(instr->shape(), new_operands, context); - instructions.push_back(std::move(new_instr)); + instructions.push_back( + instr->CloneWithNewOperands(instr->shape(), new_operands, context)); } Builder builder(name() + "." + suffix); for (auto& instr : instructions) { diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index dec96d11a93..fc7d2035e5b 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -35,7 +35,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_clone_context.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/statusor.h" @@ -215,19 +214,6 @@ class HloComputation { // this order, definitions of values always appear before their uses. std::vector MakeInstructionPostOrder() const; - // Computes and returns the reachability between HLO instructions in the - // computation. The returned HloReachabilityMap is constructed such that - // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a - // directed path (from producer to consumer) from 'a' to 'b'. Both data - // dependencies (operands) and control dependencies are considered for - // reachability. Trivially an instruction is reachable from itself. - std::unique_ptr ComputeReachability() const; - - // Updates the given reachability map after the immediate predecessor set - // (operands and control predecessors) of 'instruction' has changed. - void UpdateReachabilityThroughInstruction( - const HloInstruction* instruction, HloReachabilityMap* reachability_map); - int64 instruction_count() const { return instruction_iterators_.size(); } // Creates and returns a list of the embedded computations called by this @@ -333,14 +319,38 @@ class HloComputation { // the map's value to replace that instruction in the cloned computation. 
// // If replacements maps a key to nullptr, we remove that instruction from the - // new computation. - // If additional instructions are used by instructions in replacement map, - // they must be passed in post-order in the extras span. + // new computation. If an element of `replacements` references an instruction + // that's not already in the computation, it's cloned and added to the new + // computation. + // + // All relevant instructions are cloned, *including* unique_ptr in the + // `replacements` map. std::unique_ptr CloneWithReplacements( std::unordered_map> replacements, - absl::Span extras, HloCloneContext* context = nullptr, - const string& suffix = "clone"); + HloCloneContext* context = nullptr, const string& suffix = "clone"); + + // Convenience overloads for CloneWithReplacements. You want to do + // + // CloneWithReplacements({{a, std::move(b)}, {c, std::move(d)}}) // ERROR + // + // but that doesn't work because std::initializer_list is not movable. These + // overloads let you do + // + // CloneWithReplacementPairs({a, std::move(b)}, {c, std::move(d)}); // OK + // + std::unique_ptr CloneWithReplacementPairs( + std::pair> r1, + HloCloneContext* context = nullptr, const string& suffix = "clone"); + std::unique_ptr CloneWithReplacementPairs( + std::pair> r1, + std::pair> r2, + HloCloneContext* context = nullptr, const string& suffix = "clone"); + std::unique_ptr CloneWithReplacementPairs( + std::pair> r1, + std::pair> r2, + std::pair> r3, + HloCloneContext* context = nullptr, const string& suffix = "clone"); // Returns true if the given instruction can be removed from the computation. // Parameter instructions cannot be removed without violating invariants of @@ -355,6 +365,14 @@ class HloComputation { // channel complete). bool IsRemovable(const HloInstruction* instruction); + // Returns a map from channel-id to directed dependencies of the channel + // instructions. For send&recv pairs it means the send instruction and for + // cross-replica-sum the union of the dependencies for all participating + // instructions. + using ChannelDependencyMap = + absl::flat_hash_map>; + ChannelDependencyMap ComputeChannelDependencies() const; + // Returns true if this computation has a side effect. A computation has a // side effect if it contains one or more instructions with a side effect. bool HasSideEffect() const; @@ -410,14 +428,6 @@ class HloComputation { // Internal helper to collect unreachable roots. std::vector CollectUnreachableRoots() const; - // Returns a map from channel-id to directed dependencies of the channel - // instructions. For send&recv pairs it means the send instruction and for - // cross-replica-sum the union of the dependencies for all participating - // instructions. 
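The ERROR/OK contrast above exists because std::initializer_list only exposes const access to its elements, so a braced list of map entries would have to copy the move-only unique_ptr values. Below is a minimal self-contained sketch of the same workaround, with Key, Value, and MakeReplacements as stand-ins for the real HloInstruction types and overloads:

#include <memory>
#include <unordered_map>
#include <utility>

// Stand-ins for the real key/value types (const HloInstruction* and
// std::unique_ptr<HloInstruction> in the actual API).
using Key = const int*;
using Value = std::unique_ptr<int>;
using ReplacementMap = std::unordered_map<Key, Value>;

// Pair-by-value overload: the caller moves pairs in, and they are moved on
// into the map, so the move-only Value is never copied.
ReplacementMap MakeReplacements(std::pair<Key, Value> r1,
                                std::pair<Key, Value> r2) {
  ReplacementMap replacements;
  replacements.emplace(std::move(r1));
  replacements.emplace(std::move(r2));
  return replacements;
}

void Demo() {
  int a = 0, c = 0;
  auto b = std::make_unique<int>(1);
  auto d = std::make_unique<int>(2);

  // ReplacementMap m{{&a, std::move(b)}, {&c, std::move(d)}};  // ERROR:
  // the initializer_list constructor copies its elements, and Value is
  // move-only.

  ReplacementMap m = MakeReplacements({&a, std::move(b)},
                                      {&c, std::move(d)});  // OK
  (void)m;
}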
- using ChannelDependencyMap = - absl::flat_hash_map>; - ChannelDependencyMap ComputeChannelDependencies() const; - enum VisitState { kVisiting, kVisited }; void ComputeInstructionPostOrder( const HloComputation::ChannelDependencyMap& channel_dependency_map, diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc index 2aaaef1d36d..1e7a6e197f5 100644 --- a/tensorflow/compiler/xla/service/hlo_computation_test.cc +++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc @@ -65,7 +65,7 @@ class HloComputationTest : public HloTestBase { }; TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto negate_computation = module->AddEntryComputation(CreateNegateComputation()); EXPECT_TRUE(negate_computation->MakeEmbeddedComputationsList().empty()); @@ -73,7 +73,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) { TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) { // Create computation which calls one other computation. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto negate_computation = module->AddEmbeddedComputation(CreateNegateComputation()); auto map_computation = @@ -85,7 +85,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) { TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) { // Create computations with a diamond-shaped callgraph. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto negate_computation = module->AddEmbeddedComputation(CreateNegateComputation()); auto map1_computation = @@ -119,7 +119,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant)); } @@ -134,7 +134,7 @@ TEST_F(HloComputationTest, PostOrderSimple) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); auto negate2 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant, negate1, negate2)); @@ -151,7 +151,7 @@ TEST_F(HloComputationTest, PostOrderTrace) { builder.AddInstruction(HloInstruction::CreateTrace("foobar", negate1)); auto negate2 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Trace instructions should be at the end of the sort. 
EXPECT_THAT(computation->MakeInstructionPostOrder(), @@ -170,7 +170,7 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant4 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->MakeInstructionPostOrder(), UnorderedElementsAre(constant1, constant2, constant3, constant4)); @@ -192,7 +192,7 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) { r0f32_, HloOpcode::kAdd, constant2, constant3)); auto add3 = builder.AddInstruction(HloInstruction::CreateBinary( r0f32_, HloOpcode::kAdd, constant1, constant3)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto post_order = computation->MakeInstructionPostOrder(); EXPECT_EQ(6, post_order.size()); @@ -217,7 +217,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) { constant2, constant3)); builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, constant1, constant3)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Visitor which keeps track of which instructions have been visited. class TestVisitor : public DfsHloVisitorWithDefault { @@ -257,7 +257,7 @@ TEST_F(HloComputationTest, DeepCopyArray) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto copy = computation->DeepCopyInstruction(constant).ValueOrDie(); @@ -274,7 +274,7 @@ TEST_F(HloComputationTest, DeepCopyTuple) { auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto tuple_copy = computation->DeepCopyInstruction(tuple).ValueOrDie(); @@ -376,7 +376,7 @@ TEST_F(HloComputationTest, DeepCopyToken) { // copied. 
auto builder = HloComputation::Builder(TestName()); auto token = builder.AddInstruction(HloInstruction::CreateToken()); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto copy = computation->DeepCopyInstruction(token).ValueOrDie(); @@ -393,7 +393,7 @@ TEST_F(HloComputationTest, DeepCopyTokenTuple) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({token, constant})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto copy = computation->DeepCopyInstruction(tuple).ValueOrDie(); @@ -412,7 +412,7 @@ TEST_F(HloComputationTest, CycleDetection) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, negate, negate)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Add a control dependency to create a cycle. ASSERT_IS_OK(add->AddControlDependencyTo(negate)); @@ -440,7 +440,7 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) { r0f32_, HloOpcode::kAdd, dead_negate, dead_negate)); auto negate = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); EXPECT_THAT(computation->root_instruction(), op::Negate(constant)); @@ -466,7 +466,7 @@ TEST_F(HloComputationTest, CloneWithControlDependency) { HloInstruction::CreateParameter(0, r0f32_, "param0")); auto negate = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build(/*root_instruction=*/add)); @@ -484,107 +484,6 @@ TEST_F(HloComputationTest, CloneWithControlDependency) { EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add)); } -TEST_F(HloComputationTest, Reachability) { - // Test reachability of a non-trivial computation: - // - // const1 const2 - // | | - // | +-------+ - // | | | - // add .. negate - // | . | - // | .... exp - // | | - // +---+ +-+---+ - // | | | - // multiply copy - // - // There is a control dependency from 'add' to 'exp'. 
- auto builder = HloComputation::Builder(TestName()); - auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); - auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0f))); - auto add = builder.AddInstruction(HloInstruction::CreateBinary( - r0f32_, HloOpcode::kAdd, constant1, constant2)); - auto negate = builder.AddInstruction( - HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant2)); - auto exp = builder.AddInstruction( - HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, negate)); - auto mul = builder.AddInstruction( - HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply, add, exp)); - auto copy = builder.AddInstruction( - HloInstruction::CreateUnary(r0f32_, HloOpcode::kCopy, exp)); - - auto module = CreateNewModule(); - auto computation = - module->AddEntryComputation(builder.Build(/*root_instruction=*/mul)); - - TF_CHECK_OK(add->AddControlDependencyTo(exp)); - auto reachability = computation->ComputeReachability(); - - EXPECT_TRUE(reachability->IsReachable(constant1, constant1)); - EXPECT_FALSE(reachability->IsReachable(constant1, constant2)); - EXPECT_TRUE(reachability->IsReachable(constant1, add)); - EXPECT_FALSE(reachability->IsReachable(constant1, negate)); - EXPECT_TRUE(reachability->IsReachable(constant1, exp)); - EXPECT_TRUE(reachability->IsReachable(constant1, mul)); - EXPECT_TRUE(reachability->IsReachable(constant1, copy)); - - EXPECT_FALSE(reachability->IsReachable(constant2, constant1)); - EXPECT_TRUE(reachability->IsReachable(constant2, constant2)); - EXPECT_TRUE(reachability->IsReachable(constant2, add)); - EXPECT_TRUE(reachability->IsReachable(constant2, negate)); - EXPECT_TRUE(reachability->IsReachable(constant2, exp)); - EXPECT_TRUE(reachability->IsReachable(constant2, mul)); - EXPECT_TRUE(reachability->IsReachable(constant2, copy)); - - EXPECT_FALSE(reachability->IsReachable(exp, constant1)); - EXPECT_FALSE(reachability->IsReachable(exp, constant2)); - EXPECT_FALSE(reachability->IsReachable(exp, add)); - EXPECT_FALSE(reachability->IsReachable(exp, negate)); - EXPECT_TRUE(reachability->IsReachable(exp, exp)); - EXPECT_TRUE(reachability->IsReachable(exp, mul)); - EXPECT_TRUE(reachability->IsReachable(exp, copy)); - - EXPECT_FALSE(reachability->IsReachable(mul, constant1)); - EXPECT_FALSE(reachability->IsReachable(mul, constant2)); - EXPECT_FALSE(reachability->IsReachable(mul, add)); - EXPECT_FALSE(reachability->IsReachable(mul, negate)); - EXPECT_FALSE(reachability->IsReachable(mul, exp)); - EXPECT_TRUE(reachability->IsReachable(mul, mul)); - EXPECT_FALSE(reachability->IsReachable(mul, copy)); - - EXPECT_TRUE(reachability->IsConnected(constant1, copy)); - EXPECT_TRUE(reachability->IsConnected(copy, constant1)); - EXPECT_FALSE(reachability->IsConnected(negate, add)); - EXPECT_FALSE(reachability->IsConnected(add, negate)); - - // Remove the control dependency then update and verify the reachability map - ASSERT_IS_OK(add->RemoveControlDependencyTo(exp)); - computation->UpdateReachabilityThroughInstruction(exp, reachability.get()); - - EXPECT_TRUE(reachability->IsReachable(constant1, constant1)); - EXPECT_FALSE(reachability->IsReachable(constant1, constant2)); - EXPECT_TRUE(reachability->IsReachable(constant1, add)); - EXPECT_FALSE(reachability->IsReachable(constant1, negate)); - EXPECT_FALSE(reachability->IsReachable(constant1, exp)); - EXPECT_TRUE(reachability->IsReachable(constant1, mul)); - EXPECT_FALSE(reachability->IsReachable(constant1, copy)); - - // 
Change a use within the graph then update and verify the reachability map - ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1)); - computation->UpdateReachabilityThroughInstruction(negate, reachability.get()); - - EXPECT_FALSE(reachability->IsReachable(constant2, constant1)); - EXPECT_TRUE(reachability->IsReachable(constant2, constant2)); - EXPECT_TRUE(reachability->IsReachable(constant2, add)); - EXPECT_FALSE(reachability->IsReachable(constant2, negate)); - EXPECT_FALSE(reachability->IsReachable(constant2, exp)); - EXPECT_TRUE(reachability->IsReachable(constant2, mul)); - EXPECT_FALSE(reachability->IsReachable(constant2, copy)); -} - TEST_F(HloComputationTest, Stringification) { const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10}); @@ -606,7 +505,7 @@ TEST_F(HloComputationTest, Stringification) { 2, PrecisionConfig::DEFAULT); builder.AddInstruction( HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto options = HloPrintOptions().set_print_metadata(false); @@ -641,7 +540,7 @@ TEST_F(HloComputationTest, StringificationIndent) { 2, PrecisionConfig::DEFAULT); builder.AddInstruction( HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto options = @@ -677,7 +576,7 @@ TEST_F(HloComputationTest, StringificationCanonical) { 2, PrecisionConfig::DEFAULT); builder.AddInstruction( HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto options = HloPrintOptions().set_print_metadata(false); @@ -700,27 +599,5 @@ TEST_F(HloComputationTest, StringificationCanonical) { EXPECT_EQ(computation->ToString(options), expected_computation2); } -TEST_F(HloComputationTest, ChannelReachability) { - const Shape shape = ShapeUtil::MakeShape(F32, {5, 7}); - HloComputation::Builder builder("ChannelReachability"); - auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, shape, "param")); - auto token0 = builder.AddInstruction(HloInstruction::CreateToken()); - auto send = - builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1)); - auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); - auto token1 = builder.AddInstruction(HloInstruction::CreateToken()); - auto recv = - builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1)); - auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv)); - - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build(recv_done)); - auto reachability = computation->ComputeReachability(); - EXPECT_TRUE(reachability->IsReachable(param, recv_done)); - EXPECT_FALSE(reachability->IsReachable(send, recv)); - EXPECT_FALSE(reachability->IsReachable(send_done, recv)); -} - } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc index 4f898ce61c3..5e37883d3d8 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc @@ -52,8 +52,10 @@ 
StatusOr HloConstantFolding::Run(HloModule* module) { computation->root_instruction() != instruction) { continue; } - // Skip Constant, Parameter, and AfterAll operation. - // TODO(b/64407269): Enable Tuple once the timeout issue is resolved. + // Skip Constant, Parameter, Tuple, AfterAll operation. + // Tuple constants are not directly supported by any backends, hence + // folding Tuple is not useful and would in fact be expanded back into + // kTuple by Algebraic Simplifier. // TODO(b/110532604): Enable AfterAll once AfterAll requires at least one // operand in which case constant folding will be impossible and this // special case is not necessary. @@ -63,6 +65,7 @@ StatusOr HloConstantFolding::Run(HloModule* module) { instruction->opcode() == HloOpcode::kAfterAll) { continue; } + // Skip instructions with non-constant operands. if (!hlo_query::AllOperandsAreConstants(*instruction)) { continue; diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc index e45f905f715..d12f920722e 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc @@ -28,7 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/types.h" @@ -37,7 +37,7 @@ namespace op = xla::testing::opcode_matchers; namespace xla { namespace { -using HloConstantFoldingTest = HloVerifiedTestBase; +using HloConstantFoldingTest = HloTestBase; TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { HloComputation::Builder builder(TestName()); @@ -46,13 +46,13 @@ TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { builder.AddInstruction( HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Convert(input)); HloConstantFolding const_folder; - TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); EXPECT_THAT(computation->root_instruction(), op::Constant()); @@ -67,13 +67,13 @@ TEST_F(HloConstantFoldingTest, ConvertS64ToF32) { builder.AddInstruction( HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Convert(input)); HloConstantFolding const_folder; - TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); EXPECT_THAT(computation->root_instruction(), op::Constant()); @@ -88,13 +88,13 @@ TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) { builder.AddInstruction( HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), 
op::Convert(input)); HloConstantFolding const_folder; - TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); EXPECT_THAT(computation->root_instruction(), op::Constant()); @@ -130,11 +130,11 @@ TEST_F(HloConstantFoldingTest, Concatenate) { Shape shape = ShapeUtil::MakeShape(F32, dimensions); builder.AddInstruction(HloInstruction::CreateConcatenate( shape, operands, test_config.concat_dimension)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloConstantFolding const_folder; - TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); HloInstruction* root = computation->root_instruction(); @@ -157,11 +157,11 @@ TEST_F(HloConstantFoldingTest, Slice) { Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4}); builder.AddInstruction(HloInstruction::CreateSlice( shape, literal_instruction, slice_start, slice_limits, slice_strides)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloConstantFolding const_folder; - TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); HloInstruction* root = computation->root_instruction(); @@ -182,11 +182,11 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) { const int64 permutation[] = {1, 2, 0, 4, 3}; builder.AddInstruction( HloInstruction::CreateTranspose(shape, literal_instruction, permutation)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloConstantFolding const_folder; - TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); HloInstruction* root = computation->root_instruction(); @@ -219,27 +219,28 @@ const char* const kConstantFoldReduce = R"( })"; TEST_F(HloConstantFoldingTest, ConstantFoldReduce) { - ParseAndVerifyModule(kConstantFoldReduce); + TF_ASSERT_OK_AND_ASSIGN(auto m, + ParseAndReturnVerifiedModule(kConstantFoldReduce)); HloConstantFolding const_folder; - TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(m.get())); EXPECT_TRUE(result); - EXPECT_EQ(6, module() - .entry_computation() + EXPECT_EQ(6, m->entry_computation() ->root_instruction() ->literal() .GetFirstElement()); } TEST_F(HloConstantFoldingTest, ConstantFoldReduceNoLayout) { - ParseAndVerifyModule(kConstantFoldReduce); - HloInstruction* add = module().computations().begin()->root_instruction(); + TF_ASSERT_OK_AND_ASSIGN(auto m, + ParseAndReturnVerifiedModule(kConstantFoldReduce)); + HloInstruction* add = m->computations().begin()->root_instruction(); LayoutUtil::ClearLayout(add->mutable_shape()); HloConstantFolding const_folder; - TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(m.get())); EXPECT_FALSE(result); - EXPECT_THAT(module().entry_computation()->root_instruction(), op::Reduce()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Reduce()); } const char* const kConstantFoldLargePad = R"( diff --git 
a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 108aeea097d..fdfb38b858c 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -269,7 +269,7 @@ Status HloCostAnalysis::HandleOutfeed(const HloInstruction*) { Status HloCostAnalysis::HandleMap(const HloInstruction* map) { // Compute properties of the mapped function. TF_ASSIGN_OR_RETURN(const Properties sub_properties, - ProcessSubcomputation(map->to_apply())); + ProcessNestedSubcomputation(map->to_apply())); // Compute the cost of all elements for this Map operation. const int64 element_count = ShapeUtil::ElementsIn(map->shape()); @@ -285,7 +285,7 @@ Status HloCostAnalysis::HandleReduce(const HloInstruction* reduce) { HloComputation* function = reduce->to_apply(); // Compute the cost of the user function. TF_ASSIGN_OR_RETURN(const Properties sub_properties, - ProcessSubcomputation(function)); + ProcessNestedSubcomputation(function)); // Compute the cost of all elements for this Reduce operation. // This counts the number of times the reduction function is applied, so it @@ -311,7 +311,7 @@ Status HloCostAnalysis::HandleReduceWindow( auto function = reduce_window->to_apply(); // Compute the properties of the reduction function. TF_ASSIGN_OR_RETURN(const Properties sub_properties, - ProcessSubcomputation(function)); + ProcessNestedSubcomputation(function)); // Compute the cost of all elements for this ReduceWindow operation. For each // output element there are window_size - 1 reductions to perform. @@ -336,9 +336,9 @@ Status HloCostAnalysis::HandleSelectAndScatter( // Compute the properties of the select and scatter function. // Compute the properties of the reduction function. TF_ASSIGN_OR_RETURN(const Properties select_properties, - ProcessSubcomputation(instruction->select())); + ProcessNestedSubcomputation(instruction->select())); TF_ASSIGN_OR_RETURN(const Properties scatter_properties, - ProcessSubcomputation(instruction->scatter())); + ProcessNestedSubcomputation(instruction->scatter())); // Compute the cost of all elements for this operation. For each scatter // source element there are window_size - 1 select computations to perform and @@ -574,7 +574,7 @@ Status HloCostAnalysis::HandleRng(const HloInstruction* random) { Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { TF_ASSIGN_OR_RETURN( current_properties_, - ProcessSubcomputation(fusion->fused_instructions_computation())); + ProcessNestedSubcomputation(fusion->fused_instructions_computation())); // Fusion nodes that produce a tuple also produce the entries in the tuple. // Ignore the memory accessed inside fused ops, since fusion is supposed to @@ -595,7 +595,7 @@ Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { Status HloCostAnalysis::HandleCall(const HloInstruction* call) { TF_ASSIGN_OR_RETURN(current_properties_, - ProcessSubcomputation(call->to_apply())); + ProcessUnnestedSubcomputation(call->to_apply())); current_should_compute_bottleneck_time_ = false; return Status::OK(); } @@ -624,13 +624,12 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) { // Since the number of iterations of the while node will not always be // something that we can statically analyze, we cannot precisely compute the // cost of a while node. For now compute the cost of a single iteration. - // - // TODO(b/26346211): Improve the cost analysis for while nodes. 
TF_ASSIGN_OR_RETURN(const Properties body_properties, - ProcessSubcomputation(xla_while->while_body())); + ProcessUnnestedSubcomputation(xla_while->while_body())); - TF_ASSIGN_OR_RETURN(const Properties condition_properties, - ProcessSubcomputation(xla_while->while_condition())); + TF_ASSIGN_OR_RETURN( + const Properties condition_properties, + ProcessUnnestedSubcomputation(xla_while->while_condition())); current_properties_.clear(); for (const auto& property : body_properties) { @@ -647,10 +646,12 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) { Status HloCostAnalysis::HandleConditional(const HloInstruction* conditional) { // Compute the cost of the true and false computations and take the maximum // from those for each property. - TF_ASSIGN_OR_RETURN(const Properties true_computation_properties, - ProcessSubcomputation(conditional->true_computation())); - TF_ASSIGN_OR_RETURN(const Properties false_computation_properties, - ProcessSubcomputation(conditional->false_computation())); + TF_ASSIGN_OR_RETURN( + const Properties true_computation_properties, + ProcessUnnestedSubcomputation(conditional->true_computation())); + TF_ASSIGN_OR_RETURN( + const Properties false_computation_properties, + ProcessUnnestedSubcomputation(conditional->false_computation())); current_properties_ = true_computation_properties; for (const auto& property : false_computation_properties) { if (!tensorflow::gtl::InsertIfNotPresent(¤t_properties_, property)) { @@ -680,7 +681,7 @@ Status HloCostAnalysis::HandleScatter(const HloInstruction* scatter) { const int64 element_count = ShapeUtil::ElementsIn(scatter->operand(2)->shape()); TF_ASSIGN_OR_RETURN(const Properties sub_properties, - ProcessSubcomputation(scatter->to_apply())); + ProcessNestedSubcomputation(scatter->to_apply())); for (const auto& property : sub_properties) { if (property.first != kBytesAccessedKey) { current_properties_[property.first] = property.second * element_count; @@ -689,6 +690,11 @@ Status HloCostAnalysis::HandleScatter(const HloInstruction* scatter) { return Status::OK(); } +Status HloCostAnalysis::HandleGetDimensionSize( + const HloInstruction* /*get_size*/) { + return Status::OK(); +} + Status HloCostAnalysis::FinishVisit(const HloInstruction*) { return Status::OK(); } @@ -725,11 +731,20 @@ float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const { return GetPropertyForHlo(hlo, kOptimalSecondsKey, hlo_properties_); } -StatusOr HloCostAnalysis::ProcessSubcomputation( - HloComputation* computation) { +StatusOr +HloCostAnalysis::ProcessNestedSubcomputation(HloComputation* computation) { HloCostAnalysis visitor(shape_size_, per_second_rates_); TF_RETURN_IF_ERROR(computation->Accept(&visitor)); return visitor.properties(); } +StatusOr +HloCostAnalysis::ProcessUnnestedSubcomputation(HloComputation* computation) { + HloCostAnalysis visitor(shape_size_, per_second_rates_); + TF_RETURN_IF_ERROR(computation->Accept(&visitor)); + hlo_properties_.insert(visitor.hlo_properties_.begin(), + visitor.hlo_properties_.end()); + return visitor.properties(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 46b4bbeef22..8ced9d776e1 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -107,6 +107,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleConditional(const HloInstruction* conditional) override; Status HandleGather(const 
HloInstruction* gather) override; Status HandleScatter(const HloInstruction* scatter) override; + Status HandleGetDimensionSize(const HloInstruction* get_size) override; Status FinishVisit(const HloInstruction* root) override; Status Preprocess(const HloInstruction* hlo) override; @@ -153,7 +154,24 @@ class HloCostAnalysis : public ConstDfsHloVisitor { // Returns the properties computed from visiting the computation rooted at the // given hlo. - StatusOr ProcessSubcomputation(HloComputation* computation); + // + // The difference between ProcessNestedSubcomputation and + // ProcessUnnestedSubcomputation is that we expect to get profile results for + // an unnested subcomputation's individual instructions, while we expect that + // a nested subcomputation is completely subsumed by its parent. + // + // For example, subcomputations inside kFusion and kMap are considered nested, + // while subcomputations inside kWhile and kConditional are considered + // unnested. + // + // Another way of thinking of this is, kFusion is implemented on the GPU + // backend using just one GPU kernel, while kWhile's body is implemented as a + // sequence of kernels, one for each HLO therein. Backends don't necessarily + // need to follow this same implementation strategy, but we assume they do for + // the purposes of this platform-generic cost analysis. + StatusOr ProcessNestedSubcomputation(HloComputation* computation); + StatusOr ProcessUnnestedSubcomputation( + HloComputation* computation); // Utility function to handle all element-wise operations. Status HandleElementwiseOp(const HloInstruction* hlo_instruction); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc index 9acee892d59..6a15b3440c6 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc @@ -387,7 +387,7 @@ TEST_F(FusionCostAnalysis, LoopFusion) { HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp)); auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1}); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop); @@ -429,7 +429,7 @@ TEST_F(FusionCostAnalysis, NoLayout) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( shape_with_layout, HloOpcode::kAdd, c1, broadcast)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {add, broadcast}, HloInstruction::FusionKind::kLoop); @@ -472,7 +472,7 @@ TEST_F(DomainCostAnalysis, DomainCost) { auto domain = builder.AddInstruction( HloInstruction::CreateDomain(tuple->shape(), tuple, nullptr, nullptr)); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); EXPECT_EQ(hlo_module->entry_computation()->root_instruction(), domain); diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc index e07a196d115..aaa9ec60eb3 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc @@ -19,22 +19,22 @@ limitations under the License. 
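The nested/unnested distinction documented in hlo_cost_analysis.h above comes down to whether the child computation's per-instruction results remain visible to the parent analysis. Here is a compressed sketch of that idea, using invented stand-in types (Properties keyed by name, instructions keyed by an integer id) rather than the real HloCostAnalysis API:

#include <map>
#include <string>

using Properties = std::map<std::string, float>;  // e.g. flops, bytes accessed

struct SubAnalysis {
  std::map<int, Properties> per_instruction;  // keyed by instruction id
  Properties totals;
};

struct ParentAnalysis {
  std::map<int, Properties> per_instruction;

  // Nested child (fusion/map style): the parent instruction subsumes the
  // whole child, so only the aggregated totals are kept.
  Properties AbsorbNested(const SubAnalysis& child) { return child.totals; }

  // Unnested child (while/conditional body style): each instruction in the
  // child runs as its own step, so its per-instruction entries are merged
  // into the parent and stay individually queryable.
  Properties AbsorbUnnested(const SubAnalysis& child) {
    per_instruction.insert(child.per_instruction.begin(),
                           child.per_instruction.end());
    return child.totals;
  }
};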
#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/platform/test.h" namespace xla { namespace { -class HloCreationUtilsTest : public HloVerifiedTestBase { +class HloCreationUtilsTest : public HloTestBase { protected: - HloModule* CreateModuleWithProgramShape( + std::unique_ptr CreateModuleWithProgramShape( PrimitiveType primitive_type, absl::Span input_shape_dims, absl::Span output_shape_dims, HloInstruction** param, HloComputation** entry_computation) { Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims); Shape output_shape = ShapeUtil::MakeShape(primitive_type, output_shape_dims); - auto module = CreateNewModule("test"); + auto module = CreateNewVerifiedModule("test"); *entry_computation = module->AddEntryComputation( CreateComputationWithSignature({&input_shape}, output_shape, "entry") .ValueOrDie()); @@ -47,10 +47,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape(S32, - /*input_shape_dims=*/{2}, - /*output_shape_dims=*/{2}, - ¶m, &entry_computation); + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2}, + /*output_shape_dims=*/{2}, ¶m, + &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_1_dims_collapsed, CollapseFirstNDims(param, 1)); @@ -67,9 +66,8 @@ TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape( - S32, - /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, ¶m, + auto module = CreateModuleWithProgramShape( + S32, /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_2_dims_collapsed, @@ -92,10 +90,9 @@ TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape(S32, - /*input_shape_dims=*/{2}, - /*output_shape_dims=*/{1, 2}, - ¶m, &entry_computation); + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2}, + /*output_shape_dims=*/{1, 2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_1_degenerate_dim_prepended, PrependDegenerateDims(param, 1)); @@ -113,10 +110,9 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape( - S32, - /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 1, 2}, ¶m, - &entry_computation); + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2}, + /*output_shape_dims=*/{1, 1, 2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended, PrependDegenerateDims(param, 2)); @@ -134,10 +130,9 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape(S32, - /*input_shape_dims=*/{}, - /*output_shape_dims=*/{1, 1}, - ¶m, &entry_computation); + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{}, + /*output_shape_dims=*/{1, 1}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * 
with_2_degenerate_dims_prepended, PrependDegenerateDims(param, 2)); @@ -154,10 +149,9 @@ TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape( - S32, - /*input_shape_dims=*/{6}, /*output_shape_dims=*/{3, 1, 2}, ¶m, - &entry_computation); + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{6}, + /*output_shape_dims=*/{3, 1, 2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_dim_expanded, ExpandFirstDimIntoNDims(param, {3, 1, 2})); @@ -176,10 +170,9 @@ TEST_F(HloCreationUtilsTest, PadVectorWithZeros) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape(S32, - /*input_shape_dims=*/{2}, - /*output_shape_dims=*/{6}, - ¶m, &entry_computation); + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2}, + /*output_shape_dims=*/{6}, ¶m, + &entry_computation); TF_ASSERT_OK_AND_ASSIGN( HloInstruction * zero_padded_param, @@ -197,10 +190,9 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape(S32, - /*input_shape_dims=*/{}, - /*output_shape_dims=*/{2, 2}, - ¶m, &entry_computation); + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{}, + /*output_shape_dims=*/{2, 2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN( HloInstruction * zeros, @@ -218,10 +210,9 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) { HloInstruction* param; HloComputation* entry_computation; - HloModule* module = CreateModuleWithProgramShape(F32, - /*input_shape_dims=*/{}, - /*output_shape_dims=*/{2, 2}, - ¶m, &entry_computation); + auto module = CreateModuleWithProgramShape(F32, /*input_shape_dims=*/{}, + /*output_shape_dims=*/{2, 2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN( HloInstruction * zeros, diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc index 9b18b0284f6..1eb0260468c 100644 --- a/tensorflow/compiler/xla/service/hlo_cse_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc @@ -29,7 +29,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/util.h" @@ -44,7 +44,7 @@ namespace op = xla::testing::opcode_matchers; namespace xla { namespace { -class HloCseTest : public HloVerifiedTestBase { +class HloCseTest : public HloTestBase { protected: HloCseTest() {} }; @@ -59,13 +59,13 @@ TEST_F(HloCseTest, CombineTwoConstants) { builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(2, computation->instruction_count()); HloInstruction* constant = *computation->instructions().begin(); @@ -89,14 +89,14 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); EXPECT_THAT(add, op::Add(constant1, constant2)); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(2, computation->instruction_count()); auto first_operand = add->operand(0); @@ -121,14 +121,14 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); EXPECT_THAT(add, op::Add(constant1, constant2)); HloCSE cse(/*is_layout_sensitive=*/true); - EXPECT_FALSE(cse.Run(module).ValueOrDie()); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); EXPECT_THAT(add, op::Add(constant1, constant2)); @@ -171,13 +171,13 @@ TEST_F(HloCseTest, ConstantsSameValueDifferentType) { shape_r0, HloOpcode::kAdd, root, constants[i])); } - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(20, computation->instruction_count()); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); // CSE will remove both the second float(42.0f) and the corresponding // convert/cast. 
@@ -201,7 +201,7 @@ TEST_F(HloCseTest, NonscalarConstants) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple( {common_constant1, common_constant2, uncommon_constant})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); @@ -209,7 +209,7 @@ TEST_F(HloCseTest, NonscalarConstants) { op::Tuple(common_constant1, common_constant2, uncommon_constant)); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); auto first_operand = tuple->operand(0); @@ -233,14 +233,14 @@ TEST_F(HloCseTest, IdenticalInstructions) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2, exp3})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(5, computation->instruction_count()); EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3)); HloCSE cse(/*is_layout_sensitive=*/true); - EXPECT_TRUE(cse.Run(module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); auto first_operand = tuple->operand(0); @@ -250,7 +250,7 @@ TEST_F(HloCseTest, IdenticalInstructions) { // Test two identical while loops with same inputs TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesSameInput) { - ParseAndVerifyModule(R"( + const char* const hlo_string = R"( HloModule WhileLoopsIdenticalConditionsAndBodiesSameInput %body (param: (f32[], f32[])) -> (f32[], f32[]) { @@ -277,21 +277,21 @@ index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1) f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition.1, body=%body - } - )"); + })"; - auto computation = module().entry_computation(); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + auto computation = m->entry_computation(); EXPECT_EQ(5, computation->instruction_count()); HloCSE cse(true); - EXPECT_TRUE(cse.Run(&module()).ValueOrDie()); + EXPECT_TRUE(cse.Run(m.get()).ValueOrDie()); EXPECT_EQ(4, computation->instruction_count()); } // Test two while loops with same conditions, same inputs, but different // bodies TEST_F(HloCseTest, WhileLoopsIdenticalConditionsSameInputAndDifferentBodies) { - ParseAndVerifyModule(R"( + const char* const hlo_string = R"( HloModule WhileLoopsIdenticalConditionsSameInputAndDifferentBodies %body (param: (f32[], f32[])) -> (f32[], f32[]) { @@ -327,20 +327,20 @@ index=1 %sub = f32[] subtract(f32[] %get-tuple-element.2, f32[] %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition.1, body=%body2 - } - )"); + })"; - auto computation = module().entry_computation(); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + auto computation = m->entry_computation(); EXPECT_EQ(5, computation->instruction_count()); HloCSE cse(true); - EXPECT_FALSE(cse.Run(&module()).ValueOrDie()); + EXPECT_FALSE(cse.Run(m.get()).ValueOrDie()); EXPECT_EQ(5, computation->instruction_count()); } // Test two identical while loops with different inputs TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesDifferentInput) { - 
ParseAndVerifyModule(R"( + const char* const hlo_string = R"( HloModule WhileLoopsIdenticalConditionsAndBodiesDifferentInput %body (param: (f32[], f32[])) -> (f32[], f32[]) { @@ -369,22 +369,21 @@ condition=%condition, body=%body %constant.4 = f32[] constant(1) %constant.5 = f32[] constant(2) %tuple.2 = (f32[], f32[]) tuple(f32[] %constant.4, f32[] %constant.5) ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.2), condition=%condition.1, body=%body - } + })"; - )"); - - auto computation = module().entry_computation(); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + auto computation = m->entry_computation(); EXPECT_EQ(8, computation->instruction_count()); HloCSE cse(true); - EXPECT_FALSE(cse.Run(&module()).ValueOrDie()); + EXPECT_FALSE(cse.Run(m.get()).ValueOrDie()); EXPECT_EQ(8, computation->instruction_count()); } // Test two while loops with identical bodies and same inputs, but different // conditions TEST_F(HloCseTest, WhileLoopsIdenticalBodiesAndInputDifferntConditions) { - ParseAndVerifyModule(R"( + const char* const hlo_string = R"( HloModule WhileLoopsIdenticalBodiesAndInputDifferntConditions %body (param: (f32[], f32[])) -> (f32[], f32[]) { @@ -411,13 +410,14 @@ f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2) %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition.1, body=%body - })"); + })"; - auto computation = module().entry_computation(); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + auto computation = m->entry_computation(); EXPECT_EQ(5, computation->instruction_count()); HloCSE cse(true); - EXPECT_FALSE(cse.Run(&module()).ValueOrDie()); + EXPECT_FALSE(cse.Run(m.get()).ValueOrDie()); EXPECT_EQ(5, computation->instruction_count()); } @@ -439,14 +439,14 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); EXPECT_THAT(tuple, op::Tuple(exp1, exp2)); HloCSE cse(/*is_layout_sensitive=*/true); - EXPECT_FALSE(cse.Run(module).ValueOrDie()); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(4, computation->instruction_count()); EXPECT_THAT(tuple, op::Tuple(exp1, exp2)); @@ -470,14 +470,14 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); EXPECT_THAT(tuple, op::Tuple(exp1, exp2)); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); auto first_operand = tuple->operand(0); @@ -488,7 +488,7 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) { TEST_F(HloCseTest, FusionInternalCSE) { // Test that we can CSE expressions that live within a fusion node // computation. 
- auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); const Shape shape_r0 = ShapeUtil::MakeShape(F32, {}); @@ -512,7 +512,7 @@ TEST_F(HloCseTest, FusionInternalCSE) { EXPECT_EQ(5, fused_computation->instruction_count()); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(4, fused_computation->instruction_count()); auto root = fused_computation->root_instruction(); @@ -554,14 +554,14 @@ TEST_F(HloCseTest, IdenticalExpressions) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({add1, add2})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(8, computation->instruction_count()); EXPECT_THAT(tuple, op::Tuple(op::Add(negate1, exp1), op::Add(negate2, exp2))); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(5, computation->instruction_count()); auto operand = tuple->operand(0); @@ -586,7 +586,7 @@ TEST_F(HloCseTest, DoNotCombineRng) { builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, rng1, rng2)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); @@ -595,7 +595,7 @@ TEST_F(HloCseTest, DoNotCombineRng) { uint32 count_before = computation->instruction_count(); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_FALSE(cse.Run(module).ValueOrDie()); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); uint32 count_after = computation->instruction_count(); EXPECT_EQ(count_before, count_after); @@ -607,7 +607,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) { // Test that two calls to an impure function are not commoned. RNG // is the source of the impurity. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); // rng_function is an impure function because it does RNG. 
HloComputation* rng_function = nullptr; @@ -649,7 +649,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) { VLOG(3) << "before: " << module->ToString(); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_FALSE(cse.Run(module).ValueOrDie()); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); VLOG(3) << "after: " << module->ToString(); @@ -659,7 +659,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) { } TEST_F(HloCseTest, CompareComputations) { - ParseAndVerifyModule(R"( + const char* const hlo_string = R"( HloModule m add_computation { @@ -680,11 +680,12 @@ TEST_F(HloCseTest, CompareComputations) { r1 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation r2 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation2 ROOT f2 = (f32[],f32[]) tuple(r1, r2) - })"); + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(&module()).ValueOrDie()); - HloInstruction* root = module().entry_computation()->root_instruction(); + EXPECT_TRUE(cse.Run(m.get()).ValueOrDie()); + HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_EQ(root->operand(0), root->operand(1)); } @@ -697,19 +698,19 @@ TEST_F(HloCseTest, ConstantsSameValueInDifferentDomains) { builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, computation->instruction_count()); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_FALSE(cse.Run(module).ValueOrDie()); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(2, computation->instruction_count()); } TEST_F(HloCseTest, Domain) { - ParseAndVerifyModule(R"( + const char* const hlo_string = R"( HloModule module ENTRY %entry { %param = f32[] parameter(0), sharding={maximal device=0} @@ -730,11 +731,12 @@ ENTRY %entry { domain={kind="sharding", entry={maximal device=2}, exit={maximal device=0}} %add = f32[] add(%domain.3, %domain.4) ROOT %sub = f32[] subtract(%add, %domain.5) -})"); +})"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(&module()).ValueOrDie()); - const HloInstruction* sub = module().entry_computation()->root_instruction(); + EXPECT_TRUE(cse.Run(m.get()).ValueOrDie()); + const HloInstruction* sub = m->entry_computation()->root_instruction(); const HloInstruction* add = sub->operand(0); EXPECT_EQ(add->operand(0), add->operand(1)); EXPECT_NE(add->operand(0), sub->operand(1)); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 909853106d5..6422346c101 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -43,7 +43,7 @@ using ::testing::UnorderedElementsAre; class HloDataflowAnalysisTest : public HloTestBase, public ::testing::WithParamInterface { protected: - HloDataflowAnalysisTest() : module_(CreateNewModule()) {} + HloDataflowAnalysisTest() : module_(CreateNewUnverifiedModule()) {} // Run dataflow analysis on the member module. For convenience returns a // reference to the generated analysis stored in analysis_. 
@@ -1884,7 +1884,7 @@ INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation, class HloDataflowAnalysisTestBase : public HloTestBase { protected: void BuildModule(std::unique_ptr computation) { - module_ = CreateNewModule(); + module_ = CreateNewUnverifiedModule(); computation_ = module_->AddEntryComputation(std::move(computation)); } @@ -2476,7 +2476,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { return builder.Build(); }; - module_ = CreateNewModule(); + module_ = CreateNewUnverifiedModule(); HloComputation* cond_computation = module_->AddEmbeddedComputation(make_cond()); HloComputation* body_computation = @@ -2511,7 +2511,7 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) { auto add = sub_builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones)); - module_ = CreateNewModule(); + module_ = CreateNewUnverifiedModule(); auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build()); sub_computation->CreateFusionInstruction({add, ones}, HloInstruction::FusionKind::kLoop); diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc index 3b5cde2996c..6c8095d3977 100644 --- a/tensorflow/compiler/xla/service/hlo_dce_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc @@ -59,7 +59,7 @@ TEST_F(HloDceTest, NoDeadCode) { builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); @@ -80,7 +80,7 @@ TEST_F(HloDceTest, InstructionsWithSideEffect) { HloInstruction::CreateSend(constant, token, /*channel_id=*/0)); builder.AddInstruction(HloInstruction::CreateTuple({})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); @@ -110,7 +110,7 @@ TEST_F(HloDceTest, DeadParameters) { builder.AddInstruction(HloInstruction::CreateUnary( live_param->shape(), HloOpcode::kNegate, live_param)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(5, computation->instruction_count()); @@ -150,7 +150,7 @@ TEST_F(HloDceTest, ControlDependencies) { builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Add a control dependency between two instructions. @@ -175,7 +175,7 @@ TEST_F(HloDceTest, ControlDependencies) { // Tests that a dead call instruction is removed. TEST_F(HloDceTest, DeadInstructionWithCalledComputation) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); Shape shape = ShapeUtil::MakeShape(F32, {}); // Called computation for the call instruction. @@ -215,7 +215,7 @@ TEST_F(HloDceTest, DeadInstructionWithCalledComputation) { // Tests that a while instruction with an infeed (effectul instruction) in its // body is not removed, even its user count is 0. 
TEST_F(HloDceTest, CalledComputationWithSideEffect) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); Shape shape = ShapeUtil::MakeShape(F32, {}); // Condition computation of a while instruction. @@ -270,7 +270,7 @@ TEST_F(HloDceTest, CalledComputationWithSideEffect) { // Tests that a nested call instruction with a side effect is not removed. TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); Shape shape = ShapeUtil::MakeShape(F32, {}); // Nested called computation with a side effect. @@ -323,7 +323,7 @@ TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) { } TEST_F(HloDceTest, RemoveDeadSubcomputation) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); HloComputation::Builder subcomp_builder("reduction_subcomp"); @@ -364,7 +364,7 @@ TEST_F(HloDceTest, RemoveDeadSubcomputation) { } TEST_F(HloDceTest, KeepUsedSubcomputation) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation::Builder builder(TestName()); HloComputation::Builder subcomp_builder("reduction_subcomp"); diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc index b90e8db2339..acdb42128e3 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_test.cc +++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ #include "absl/memory/memory.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo_domain_isolator.h" #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" #include "tensorflow/compiler/xla/service/hlo_domain_remover.h" @@ -22,13 +22,12 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { -class HloDomainTest : public HloVerifiedTestBase { +class HloDomainTest : public HloTestBase { protected: bool FindUserViaDomainPath(HloInstruction* instruction, HloInstruction* operand) const { @@ -64,13 +63,6 @@ class HloDomainTest : public HloVerifiedTestBase { } return false; } - - StatusOr ParseModule(absl::string_view hlo_string) { - HloModuleConfig config; - config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); - ParseAndVerifyModule(hlo_string, config); - return &module(); - } }; // Dummy DomainMetadata implementation which create kDomain boundaries around @@ -144,31 +136,32 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; }); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); EXPECT_TRUE(isolator_changed); - EXPECT_TRUE(HasDomainEdge(module, "c", "a")); - EXPECT_TRUE(HasDomainEdge(module, "c", "b")); - EXPECT_TRUE(HasDomainEdge(module, "d", "a")); - EXPECT_TRUE(HasDomainEdge(module, "d", "b")); - EXPECT_FALSE(HasDomainEdge(module, "e", "c")); - EXPECT_FALSE(HasDomainEdge(module, "e", "d")); + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); HloDomainRemover remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); EXPECT_TRUE(remover_changed); - EXPECT_FALSE(HasDomainEdge(module, "c", "a")); - EXPECT_FALSE(HasDomainEdge(module, "c", "b")); - EXPECT_FALSE(HasDomainEdge(module, "d", "a")); - EXPECT_FALSE(HasDomainEdge(module, "d", "b")); - EXPECT_FALSE(HasDomainEdge(module, "e", "c")); - EXPECT_FALSE(HasDomainEdge(module, "e", "d")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); } TEST_F(HloDomainTest, CheckNoDomainAddedIfNoSharding) { @@ -186,11 +179,12 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; }); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); EXPECT_TRUE(!isolator_changed); } @@ -213,26 +207,27 @@ ENTRY entry { } )"; - 
TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; }); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); EXPECT_TRUE(isolator_changed); - EXPECT_TRUE(HasDomainEdge(module, "b", "a")); - EXPECT_TRUE(HasDomainEdge(module, "f", "e_element")); - EXPECT_FALSE(HasDomainEdge(module, "a", "p0")); - EXPECT_FALSE(HasDomainEdge(module, "c", "b")); - EXPECT_FALSE(HasDomainEdge(module, "e", "d")); + EXPECT_TRUE(HasDomainEdge(module.get(), "b", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "f", "e_element")); + EXPECT_FALSE(HasDomainEdge(module.get(), "a", "p0")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); HloDomainRemover remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); EXPECT_TRUE(remover_changed); - EXPECT_FALSE(HasDomainEdge(module, "b", "a")); - EXPECT_FALSE(HasDomainEdge(module, "f", "e_element")); + EXPECT_FALSE(HasDomainEdge(module.get(), "b", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "f", "e_element")); } TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) { @@ -250,11 +245,12 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; }); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); EXPECT_FALSE(isolator_changed); } @@ -273,15 +269,16 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainRemover remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); EXPECT_FALSE(remover_changed); - HloInstruction* add = FindInstruction(module, "c"); + HloInstruction* add = FindInstruction(module.get(), "c"); ASSERT_NE(add, nullptr); auto device = add->sharding_unique_device(); EXPECT_TRUE(device.has_value()); @@ -304,41 +301,42 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainIsolator sharding_isolator([]() { return ShardingDomainCreator{}; }); TF_ASSERT_OK_AND_ASSIGN(bool sharding_isolator_changed, - sharding_isolator.Run(module)); + sharding_isolator.Run(module.get())); EXPECT_TRUE(sharding_isolator_changed); HloDomainIsolator opname_isolator([]() { return OpNameDomainCreator{}; }); TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed, - opname_isolator.Run(module)); + opname_isolator.Run(module.get())); 
EXPECT_TRUE(opname_isolator_changed); - EXPECT_TRUE(HasDomainEdge(module, "c", "a")); - EXPECT_TRUE(HasDomainEdge(module, "c", "b")); - EXPECT_TRUE(HasDomainEdge(module, "d", "a")); - EXPECT_TRUE(HasDomainEdge(module, "d", "c")); - EXPECT_FALSE(HasDomainEdge(module, "e", "d")); + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); HloDomainRemover sharding_remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed, - sharding_remover.Run(module)); + sharding_remover.Run(module.get())); EXPECT_TRUE(sharding_remover_changed); HloDomainRemover opname_remover(OpNameMetadata::KindName(), OpNameDomainNormalizer); TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed, - opname_remover.Run(module)); + opname_remover.Run(module.get())); EXPECT_TRUE(opname_remover_changed); - EXPECT_FALSE(HasDomainEdge(module, "c", "a")); - EXPECT_FALSE(HasDomainEdge(module, "c", "b")); - EXPECT_FALSE(HasDomainEdge(module, "d", "a")); - EXPECT_FALSE(HasDomainEdge(module, "d", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c")); } TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) { @@ -359,16 +357,17 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; }); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); EXPECT_TRUE(isolator_changed); - EXPECT_TRUE(HasDomainEdge(module, "infeed.data", "infeed")); - EXPECT_FALSE(HasDomainEdge(module, "copy0", "gte0")); - EXPECT_FALSE(HasDomainEdge(module, "copy1", "gte1")); + EXPECT_TRUE(HasDomainEdge(module.get(), "infeed.data", "infeed")); + EXPECT_FALSE(HasDomainEdge(module.get(), "copy0", "gte0")); + EXPECT_FALSE(HasDomainEdge(module.get(), "copy1", "gte1")); // Inject unassigned tuple/gte within the infeed domain, to simulate the // HLO passes adding unexpected instructions. 
@@ -384,7 +383,7 @@ ENTRY entry { // \ / // TUPLE // | - HloInstruction* infeed_data = FindInstruction(module, "infeed.data"); + HloInstruction* infeed_data = FindInstruction(module.get(), "infeed.data"); ASSERT_NE(infeed_data, nullptr); auto infeed_data_users = infeed_data->users(); @@ -410,7 +409,7 @@ ENTRY entry { HloDomainRemover remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); EXPECT_TRUE(remover_changed); struct Assignment { @@ -446,25 +445,26 @@ ENTRY entry { sharding={maximal device=1} })"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; }); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); EXPECT_TRUE(isolator_changed); - EXPECT_TRUE(HasDomainEdge(module, "tuple", "param")); - EXPECT_FALSE(HasDomainEdge(module, "gte", "tuple")); + EXPECT_TRUE(HasDomainEdge(module.get(), "tuple", "param")); + EXPECT_FALSE(HasDomainEdge(module.get(), "gte", "tuple")); // Remove %tuple and %gte (tuple simplification) - HloInstruction* gte = FindInstruction(module, "gte"); - HloInstruction* tuple = FindInstruction(module, "tuple"); + HloInstruction* gte = FindInstruction(module.get(), "gte"); + HloInstruction* tuple = FindInstruction(module.get(), "tuple"); module->entry_computation()->set_root_instruction(tuple->mutable_operand(0)); TF_EXPECT_OK(module->entry_computation()->RemoveInstruction(gte)); TF_EXPECT_OK(module->entry_computation()->RemoveInstruction(tuple)); HloDomainRemover remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); EXPECT_TRUE(remover_changed); const HloInstruction* root = module->entry_computation()->root_instruction(); @@ -486,11 +486,11 @@ TEST_F(HloDomainTest, DumpParseNullSharding) { builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, domain, domain)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto hlo_string = module->ToString(); - ASSERT_TRUE(ParseModule(hlo_string).status().ok()); + ASSERT_TRUE(ParseAndReturnVerifiedModule(hlo_string).status().ok()); } // Tuple inputs are domain instructions. @@ -507,20 +507,21 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; }); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); EXPECT_TRUE(isolator_changed); // Clear sharding of tpl instruction, in order to test domain sharding // application. 
- auto tpl = FindInstruction(module, "tpl"); + auto tpl = FindInstruction(module.get(), "tpl"); tpl->clear_sharding(); HloDomainRemover remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); EXPECT_TRUE(remover_changed); EXPECT_EQ(HloSharding::Tuple(tpl->shape(), {HloSharding::AssignDevice(1), @@ -555,36 +556,37 @@ ENTRY %entry (p0: (f32[4], f32[4])) -> (f32[4], f32[4], f32[4]) { ROOT %g = (f32[4], f32[4], f32[4]) tuple(%domain.2, %domain.3, %domain.4) })"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainIsolator opname_isolator([]() { return OpNameDomainCreator{}; }); TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed, - opname_isolator.Run(module)); + opname_isolator.Run(module.get())); EXPECT_TRUE(opname_isolator_changed); - EXPECT_TRUE(HasDomainEdge(module, "c", "a")); - EXPECT_TRUE(HasDomainEdge(module, "c", "b")); - EXPECT_TRUE(HasDomainEdge(module, "d", "a")); - EXPECT_TRUE(HasDomainEdge(module, "d", "c")); - EXPECT_FALSE(HasDomainEdge(module, "e", "d")); + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); HloDomainRemover sharding_remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed, - sharding_remover.Run(module)); + sharding_remover.Run(module.get())); EXPECT_TRUE(sharding_remover_changed); HloDomainRemover opname_remover(OpNameMetadata::KindName(), OpNameDomainNormalizer); TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed, - opname_remover.Run(module)); + opname_remover.Run(module.get())); EXPECT_TRUE(opname_remover_changed); - EXPECT_FALSE(HasDomainEdge(module, "c", "a")); - EXPECT_FALSE(HasDomainEdge(module, "c", "b")); - EXPECT_FALSE(HasDomainEdge(module, "d", "a")); - EXPECT_FALSE(HasDomainEdge(module, "d", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c")); } // Emulate instructions inserted at top and bottom within nested tuple domain. @@ -603,15 +605,16 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; }); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); EXPECT_TRUE(isolator_changed); // Clear sharding of tuple.0 instruction, in order to test domain sharding // application. 
- auto tuple0 = FindInstruction(module, "tuple.0"); + auto tuple0 = FindInstruction(module.get(), "tuple.0"); tuple0->clear_sharding(); // Insert the following instructons above and below tuple.0, to emulate other @@ -655,7 +658,7 @@ ENTRY entry { HloDomainRemover remover(ShardingMetadata::KindName(), ShardingMetadata::NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); EXPECT_TRUE(remover_changed); EXPECT_TRUE(tuple0->has_sharding()); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index 608a42bb607..d95b6ad04f2 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -33,7 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" @@ -50,9 +50,9 @@ namespace { static std::array use_bf16_params{true, false}; class HloEvaluatorTest : public ::testing::WithParamInterface, - public HloVerifiedTestBase { + public HloTestBase { protected: - HloEvaluatorTest() : HloVerifiedTestBase(), use_bfloat16_(GetParam()) { + HloEvaluatorTest() : HloTestBase(), use_bfloat16_(GetParam()) { evaluator_ = absl::make_unique(); } @@ -60,14 +60,14 @@ class HloEvaluatorTest : public ::testing::WithParamInterface, if (use_bfloat16_) { // In BF16 mode, we convert all F32 type to BF16 and evaluate the module. auto type_converter = HloElementTypeConverter(F32, BF16); - type_converter.Run(&module()).ValueOrDie(); + type_converter.Run(m_.get()).ValueOrDie(); } - return evaluator_->Evaluate(*module().entry_computation(), arg_literals) + return evaluator_->Evaluate(*m_->entry_computation(), arg_literals) .ConsumeValueOrDie(); } - // Evaluate function that takes in a local module instead of using module_ - // that is in HloVerifiedTestBase. Once module_ in HloVerifiedTestBase is + // Evaluate function that takes in a local module instead of using m_ + // that is in HloTestBase. Once m_ in HloTestBase is // removed, this should be the default Evaluate function. 
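
The evaluator-test conversion that starts here replaces HloVerifiedTestBase::module() with a module owned by the fixture itself (the m_ member introduced further down in this file). A condensed sketch of where the fixture ends up, with the Span element type and the ownership of evaluator_ assumed rather than quoted from the patch:

    class HloEvaluatorTestSketch : public HloTestBase {
     protected:
      // Runs the interpreter over the entry computation of the member module m_.
      Literal Evaluate(absl::Span<const Literal* const> args = {}) {
        return evaluator_->Evaluate(*m_->entry_computation(), args)
            .ConsumeValueOrDie();
      }
      std::unique_ptr<HloEvaluator> evaluator_ = absl::make_unique<HloEvaluator>();
      // Tests either build into m_ (m_->AddEntryComputation(b.Build())) or
      // replace it wholesale via
      // TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(...)).
      std::unique_ptr<HloModule> m_ = CreateNewVerifiedModule();
    };
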
Literal EvaluateWithModule( HloModule* module, absl::Span arg_literals = {}) { @@ -88,7 +88,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface, auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(input))); b.AddInstruction(HloInstruction::CreateUnary(expected.shape(), opcode, c1)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -108,7 +108,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface, auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs))); b.AddInstruction( HloInstruction::CreateBinary(expected.shape(), opcode, c1, c2)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -116,6 +116,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface, } bool use_bfloat16_; + std::unique_ptr m_ = CreateNewVerifiedModule(); }; #define XLA_TYPED_TEST_P(test_case_name, test_name, test_type1) \ @@ -135,7 +136,7 @@ TEST_P(HloEvaluatorTest, DoesClamp) { auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high))); b.AddInstruction( HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -156,7 +157,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) { auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high))); b.AddInstruction( HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -181,7 +182,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) { b.AddInstruction(HloInstruction::CreateConstant(std::move(on_false))); b.AddInstruction( HloInstruction::CreateTernary(shape, HloOpcode::kSelect, c1, c2, c3)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate({}); @@ -322,7 +323,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) { b.AddInstruction(HloInstruction::CreateParameter(2, shape, "rhs2")); b.AddInstruction(HloInstruction::CreateBinary(shape, HloOpcode::kAdd, lhs_instruction, param_rhs2)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(args); @@ -346,7 +347,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) { const int64 permutation[] = {1, 2, 0, 4, 3}; b.AddInstruction( HloInstruction::CreateTranspose(shape, literal_instruction, permutation)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate({}); @@ -367,7 +368,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) { HloInstruction::CreateConstant(std::move(input_literal))); b.AddInstruction(HloInstruction::CreateBroadcast( output_literal.shape(), literal_instruction, {1, 2})); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate({}); @@ -386,7 +387,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcastScalar) { b.AddInstruction(HloInstruction::CreateBroadcast( output_literal.shape(), literal_instruction, /*broadcast_dimensions=*/{})); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate({}); @@ -406,7 +407,7 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) { Shape shape = ShapeUtil::MakeShape(S64, {4, 2}); b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0)); - 
module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -428,7 +429,7 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) { Shape shape = ShapeUtil::MakeShape(S64, {2}); b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -448,7 +449,7 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) { HloInstruction* constant = b.AddInstruction( HloInstruction::CreateConstant(std::move(input_literal))); b.AddInstruction(HloInstruction::CreateConvert(expected.shape(), constant)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -468,7 +469,7 @@ TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) { HloInstruction* constant = b.AddInstruction( HloInstruction::CreateConstant(std::move(input_literal))); b.AddInstruction(HloInstruction::CreateConvert(expected.shape(), constant)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -503,7 +504,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) { Shape shape = ShapeUtil::MakeShape(S32, {5, 2}); b.AddInstruction(HloInstruction::CreatePad( shape, operand_instruction, padding_value_instruction, padding_config)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -530,7 +531,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) { CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}, {{0, 0, 0}}, {{0, 0, 0}}}); b.AddInstruction(HloInstruction::CreatePad( shape, input_instruction, pad_instruction, r4_padding_on_dim0_dim1)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -574,7 +575,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) { pad_value_instruction, r2_padding_on_dim0_dim1)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -619,7 +620,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) { pad_value_instruction, r2_padding_on_dim0_dim1)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -658,7 +659,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) { b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction, rhs_instruction, dot_dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -704,7 +705,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) { b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction, rhs_instruction, dot_dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -748,7 +749,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) { b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction, rhs_instruction, dot_dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -802,7 +803,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) { b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); 
Literal result = Evaluate(); @@ -857,7 +858,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) { b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -941,7 +942,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) { b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1019,7 +1020,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) { b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1079,7 +1080,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) { b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1143,7 +1144,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) { b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1215,7 +1216,7 @@ TEST_P(HloEvaluatorTest, b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1286,7 +1287,7 @@ TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) { b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/2, window, dnums, DefaultPrecisionConfig(2))); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1297,11 +1298,12 @@ TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) { EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); } -class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {}; +class HloEvaluatorPreciseReduceTest : public HloTestBase {}; // Tests that Reduce doesn't lose precision when adding many numbers (because // it accumulates its result in a double). 
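
The comment on the precise-reduce test that follows ("float += 1 saturates at 1<<24") is a plain single-precision fact and can be checked outside XLA; this small standalone snippet only illustrates why the evaluator accumulates the reduction in a double:

    #include <cstdio>

    int main() {
      float f = 1 << 24;   // 2^24 = 16777216; beyond this, float cannot hold n + 1
      double d = 1 << 24;
      std::printf("%.1f\n", f + 1.0f);  // prints 16777216.0: the +1 is rounded away
      std::printf("%.1f\n", d + 1.0);   // prints 16777217.0: double keeps the +1
      return 0;
    }
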
TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder b(TestName()); constexpr int kNumElements = 1 << 25; // float += 1 saturates at 1<<24 @@ -1319,12 +1321,12 @@ TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) { HloInstruction::CreateParameter(1, scalar_shape, "rhs")); add_computation.AddInstruction(HloInstruction::CreateBinary( scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs)); - auto add_func = module().AddEmbeddedComputation(add_computation.Build()); + auto add_func = m->AddEmbeddedComputation(add_computation.Build()); HloInstruction* reduce_instruction = b.AddInstruction( HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value, /*dimensions_to_reduce=*/{0}, add_func)); - module().AddEntryComputation(b.Build()); + m->AddEntryComputation(b.Build()); HloEvaluator hlo_eval; Literal result = hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie(); @@ -1337,7 +1339,7 @@ void BM_ReducePrecisely(int num_iters) { tensorflow::testing::StopTiming(); HloComputation::Builder b("BM_ReducePrecisely"); HloModuleConfig config; - config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + config.set_debug_options(GetDebugOptionsFromFlags()); HloModule module("BM_ReducePrecisely", config); constexpr int kNumElements = 1 << 25; // float += 1 saturates at 1<<24 @@ -1396,14 +1398,14 @@ TEST_P(HloEvaluatorTest, ReduceAdd) { HloInstruction::CreateParameter(1, scalar_shape, "rhs")); add_computation.AddInstruction(HloInstruction::CreateBinary( scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs)); - auto add_func = module().AddEmbeddedComputation(add_computation.Build()); + auto add_func = m_->AddEmbeddedComputation(add_computation.Build()); Shape shape = ShapeUtil::MakeShape(F32, {2}); b.AddInstruction( HloInstruction::CreateReduce(shape, arg_instruction, init_value, /*dimensions_to_reduce=*/{1}, add_func)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1438,7 +1440,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) { HloInstruction::CreateParameter(1, scalar_shape, "rhs")); max_computation.AddInstruction(HloInstruction::CreateBinary( scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs)); - auto max_func = module().AddEmbeddedComputation(max_computation.Build()); + auto max_func = m_->AddEmbeddedComputation(max_computation.Build()); Window window; WindowDimension dim; @@ -1455,7 +1457,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) { b.AddInstruction(HloInstruction::CreateReduceWindow( shape, arg_instruction, init_value, window, max_func)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1490,7 +1492,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) { HloInstruction::CreateParameter(1, scalar_shape, "rhs")); max_computation.AddInstruction(HloInstruction::CreateBinary( scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs)); - auto max_func = module().AddEmbeddedComputation(max_computation.Build()); + auto max_func = m_->AddEmbeddedComputation(max_computation.Build()); Window window; WindowDimension dim; @@ -1507,7 +1509,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) { b.AddInstruction(HloInstruction::CreateReduceWindow( shape, arg_instruction, init_value, window, max_func)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1541,7 +1543,7 @@ TEST_P(HloEvaluatorTest, 
ReduceWindowAdd) { HloInstruction::CreateParameter(1, scalar_shape, "rhs")); add_computation.AddInstruction(HloInstruction::CreateBinary( scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs)); - auto add_func = module().AddEmbeddedComputation(add_computation.Build()); + auto add_func = m_->AddEmbeddedComputation(add_computation.Build()); Window window; WindowDimension dim; @@ -1564,7 +1566,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) { b.AddInstruction(HloInstruction::CreateReduceWindow( shape, arg_instruction, init_value, window, add_func)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1594,7 +1596,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) { HloInstruction::CreateParameter(1, scalar_shape, "rhs")); add_computation.AddInstruction(HloInstruction::CreateBinary( scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs)); - auto add_func = module().AddEmbeddedComputation(add_computation.Build()); + auto add_func = m_->AddEmbeddedComputation(add_computation.Build()); Window window; @@ -1625,7 +1627,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) { b.AddInstruction(HloInstruction::CreateReduceWindow( shape, arg_instruction, init_value, window, add_func)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1657,7 +1659,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) { /*start_indices=*/{0, 2}, /*limit_indices=*/{3, 5}, /*strides=*/{2, 3})); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1691,7 +1693,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) { Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand, start_indices, {2, 3})); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1727,7 +1729,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) { Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand, start_indices, {2, 3})); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1764,7 +1766,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) { Shape shape = ShapeUtil::MakeShape(F64, {2, 3}); b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( shape, operand, update, start_indices)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1800,7 +1802,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) { Shape shape = ShapeUtil::MakeShape(F64, {2, 3}); b.AddInstruction(HloInstruction::CreateGetTupleElement(shape, tuple, 1)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1839,7 +1841,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) { b.AddInstruction( HloInstruction::CreateGetTupleElement(tuple2->shape(), outer_tuple, 1)); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1877,7 +1879,7 @@ TEST_P(HloEvaluatorTest, Reverse) { const Shape shape = ShapeUtil::MakeShape(F32, {4, 3, 2, 1}); b.AddInstruction(HloInstruction::CreateReverse(shape, operand, {0, 1})); - module().AddEntryComputation(b.Build()); + m_->AddEntryComputation(b.Build()); Literal result = Evaluate(); @@ -1966,7 +1968,7 @@ ENTRY main { slice_sizes={1, 3} } )"; - ParseAndVerifyModule(hlo_text); + 
TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal start_indices = LiteralUtil::CreateR1({0, 2}); @@ -1990,7 +1992,7 @@ ENTRY main { slice_sizes={3, 1} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal start_indices = LiteralUtil::CreateR1({0, 2}); @@ -2014,7 +2016,7 @@ ENTRY main { slice_sizes={3, 1} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal start_indices = LiteralUtil::CreateR2({{0, 2}, {2, 1}}); @@ -2039,7 +2041,7 @@ ENTRY main { slice_sizes={1,1,2} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // {{-4, 4}, {-5, 5}, {-6, 6}}, // @@ -2066,7 +2068,7 @@ ENTRY main { slice_sizes={1,1,2} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // {{-4, 4}, {-5, 5}, {-6, 6}}, // @@ -2092,7 +2094,7 @@ ENTRY main { slice_sizes={1,1} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal start_indices = LiteralUtil::CreateR1({1, 1}); @@ -2115,7 +2117,7 @@ ENTRY main { slice_sizes={1,1} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal start_indices = LiteralUtil::CreateR2({{2, 1}, {1, 1}}); @@ -2139,7 +2141,7 @@ ENTRY main { slice_sizes={1, 0} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{}, {}, {}}); Literal start_indices = LiteralUtil::CreateR1({0, 2}); EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR2({{}, {}}), @@ -2161,7 +2163,7 @@ ENTRY main { slice_sizes={1} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR1({0, 1, 2}); Literal start_indices = @@ -2192,7 +2194,7 @@ ENTRY main { index_vector_dim=1 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); @@ -2223,7 +2225,7 @@ ENTRY main { index_vector_dim=1 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); @@ -2256,7 +2258,7 @@ ENTRY main { index_vector_dim=1 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); @@ -2288,7 +2290,7 @@ ENTRY main { index_vector_dim=1 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, 
ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); @@ -2320,7 +2322,7 @@ ENTRY main { index_vector_dim=1 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2( {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}}); Literal scatter_indices = LiteralUtil::CreateR1({2, 1}); @@ -2354,7 +2356,7 @@ ENTRY main { index_vector_dim=1 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal scatter_indices = LiteralUtil::CreateR1({1, 1}); @@ -2386,7 +2388,7 @@ ENTRY main { index_vector_dim=2 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal scatter_indices = LiteralUtil::CreateR2({{0, 2}, {2, 1}}); @@ -2418,7 +2420,7 @@ ENTRY main { index_vector_dim=1 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // {{-4, 4}, {-5, 5}, {-6, 6}}, // @@ -2455,7 +2457,7 @@ ENTRY main { index_vector_dim=0 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // {{-4, 4}, {-5, 5}, {-6, 6}}, // @@ -2491,7 +2493,7 @@ ENTRY main { index_vector_dim=0 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal scatter_indices = LiteralUtil::CreateR1({1, 1}); @@ -2523,7 +2525,7 @@ ENTRY main { index_vector_dim=0 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); Literal scatter_indices = LiteralUtil::CreateR2({{2, 1}, {1, 1}}); @@ -2555,7 +2557,7 @@ ENTRY main { index_vector_dim=1 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR2({{}, {}, {}}); Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); Literal updates = LiteralUtil::CreateR2({{}, {}}); @@ -2585,7 +2587,7 @@ ENTRY main { index_vector_dim=2 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal operand = LiteralUtil::CreateR1({0, 1, 2}); Literal scatter_indices = @@ -2736,7 +2738,7 @@ ENTRY main { ROOT %reduce = bf16[] reduce(arg0, init), dimensions={0}, to_apply=add_bf16 } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal arg = LiteralUtil::CreateR1( {bfloat16(1.0f), bfloat16(3.0f), bfloat16(-2.0f), bfloat16(42.0f)}); @@ -2754,7 +2756,7 @@ ENTRY main { ROOT %slice = f32[2,2,2]{1,0,2} slice(f32[2,2,2]{0,1,2} %arg), slice={[0:2], [0:2], [0:2]} } )"; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); Literal arg = LiteralUtil::CreateR3WithLayout( {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc 
b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 13a74fd8a11..05cc1593e4e 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1043,6 +1043,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kDomain: case HloOpcode::kFusion: case HloOpcode::kMap: + case HloOpcode::kGetDimensionSize: return kGray; case HloOpcode::kCrossReplicaSum: case HloOpcode::kAllToAll: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index f6ed86b4165..26786ee950b 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -312,6 +312,10 @@ StatusOr> HloInstruction::CreateFromProto( proto.exponent_bits(), proto.mantissa_bits()); break; case HloOpcode::kInfeed: { + TF_RET_CHECK(ShapeUtil::IsTuple(proto.shape()) && + (ShapeUtil::TupleElementCount(proto.shape()) == 2)) + << "Infeed should have a tuple shape with 2 operands, but has: " + << proto.shape(); const Shape& data_shape = ShapeUtil::GetTupleElementShape(proto.shape(), 0); TF_RET_CHECK(proto.operand_ids_size() == 1) @@ -530,6 +534,12 @@ StatusOr> HloInstruction::CreateFromProto( absl::make_unique(exit_hlo_sharding)); break; } + case HloOpcode::kGetDimensionSize: + TF_RET_CHECK(proto.operand_ids_size() == 1); + TF_RET_CHECK(proto.dimensions_size() == 1); + instruction = CreateGetDimensionSize(proto.shape(), operands(0), + proto.dimensions(0)); + break; default: { instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape())); for (const int64 operand_id : proto.operand_ids()) { @@ -1001,6 +1011,14 @@ HloInstruction::CreateSelectAndScatter( broadcast_dimensions); } +/* static */ std::unique_ptr +HloInstruction::CreateGetDimensionSize(const Shape& shape, + HloInstruction* operand, + int64 dimension) { + return absl::make_unique(shape, operand, + dimension); +} + /* static */ std::unique_ptr HloInstruction::CreateBroadcastSequence( const Shape& output_shape, HloInstruction* operand, @@ -1109,7 +1127,11 @@ void HloInstruction::set_single_sharding(const HloSharding& sharding) { void HloInstruction::SetupDerivedInstruction( HloInstruction* derived_instruction) const { - if (sharding_ != nullptr) { + if (sharding_ != nullptr && ShapeUtil::CompatibleIgnoringElementType( + shape_, derived_instruction->shape())) { + // Only copy sharding if the shape of the two instruction is compatible + // because copying it between differently shaped instructions can produce + // invalid shardings. derived_instruction->set_sharding(*sharding_); } else { derived_instruction->clear_sharding(); @@ -1268,6 +1290,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kIota: case HloOpcode::kDot: case HloOpcode::kDomain: + case HloOpcode::kGetDimensionSize: clone = CloneWithNewOperandsImpl(shape, new_operands, context); break; // Unary ops. @@ -1715,6 +1738,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kScatter: case HloOpcode::kDot: case HloOpcode::kDomain: + case HloOpcode::kGetDimensionSize: LOG(FATAL) << "Base class impl called for opcode with subclass: " << opcode(); } @@ -2440,6 +2464,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleAfterAll(this); case HloOpcode::kIota: return visitor->HandleIota(this); + case HloOpcode::kGetDimensionSize: + return visitor->HandleGetDimensionSize(this); // These opcodes are not handled here. 
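
The hlo_instruction.cc hunks above wire a kGetDimensionSize opcode through proto deserialization, cloning, and visiting, and add the CreateGetDimensionSize factory. A hedged usage sketch; the shapes are chosen purely for illustration, and the s32[] result shape is an assumption, not something this patch states:

    HloComputation::Builder b("get_dimension_size_sketch");
    HloInstruction* param = b.AddInstruction(HloInstruction::CreateParameter(
        0, ShapeUtil::MakeShape(F32, {8, 16}), "p"));
    // Asks for the size of dimension 1 of %p as a scalar value.
    b.AddInstruction(HloInstruction::CreateGetDimensionSize(
        ShapeUtil::MakeShape(S32, {}), param, /*dimension=*/1));
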
case HloOpcode::kTrace: @@ -2639,49 +2665,7 @@ Status HloInstruction::Accept( return this->Accept(&visitor); } -Status HloInstruction::AcceptOrdered( - DfsHloVisitor* visitor, const std::vector& order) { - VLOG(2) << "HloInstruction::AcceptOrdered(%" << name() << ")"; - TF_RET_CHECK(OrderIsTopologicalSort(order)); - - // Compute the predecessors of this instruction. - std::unordered_set predecessors; - TF_RETURN_IF_ERROR(this->Accept([&predecessors](HloInstruction* instruction) { - predecessors.insert(instruction); - return Status::OK(); - })); - - for (auto* const_instruction : order) { - if (!ContainsKey(predecessors, const_instruction)) { - // Instruction is not a predecessors of 'this'. - continue; - } - - // The visitor can mark instructions as visited to skip particular - // instructions. - if (visitor->DidVisit(*const_instruction)) { - VLOG(3) << "Not visiting HLO %" << const_instruction->name() - << " as it was already visited."; - continue; - } - - // TODO(b/78350259): Eliminate const laundering. - HloInstruction* instruction = - const_cast(const_instruction); - - TF_RETURN_IF_ERROR(visitor->Preprocess(instruction)); - VLOG(2) << "Visiting HLO %" << instruction->name(); - TF_RETURN_IF_ERROR(instruction->Visit(visitor)); - visitor->SetVisited(*instruction); - TF_RETURN_IF_ERROR(visitor->Postprocess(instruction)); - } - - return visitor->FinishVisit(this); -} - -const Shape& HloInstruction::shape() const { - return shape_; -} +const Shape& HloInstruction::shape() const { return shape_; } std::vector HloInstruction::OperandIndices( const HloInstruction* operand) const { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 15a4da8dbe0..818d4ede0f3 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -767,6 +767,9 @@ class HloInstruction { // when we plumb a primordial token from the entry computation. static std::unique_ptr CreateToken(); + static std::unique_ptr CreateGetDimensionSize( + const Shape& shape, HloInstruction* operand, int64 dimension); + // Returns the opcode for this instruction. HloOpcode opcode() const { return opcode_; } @@ -954,16 +957,6 @@ class HloInstruction { Status Accept( const std::function& visitor_func) const; - // Visits all instructions rooted at this instruction using the given visitor - // in the given order. 'order' must contain at least the set of instructions - // rooted at this node (ie, those accessible from a DFS traversal from this - // instruction). Instructions contained in 'order' which are not in the set of - // instructions rooted at this node are ignored. 'order' must also be a valid - // topological sort of these instructions (defs appear before uses) though - // need not be a DFS post-order. - Status AcceptOrdered(DfsHloVisitor* visitor, - const std::vector& order); - // Visit this instruction and only this instruction with the given visitor. template Status Visit(DfsHloVisitorBase* visitor); diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index d93351fe043..8048e332cb5 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -29,7 +29,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" @@ -39,7 +39,7 @@ namespace { using ::testing::ElementsAre; using ::testing::UnorderedElementsAre; -class HloInstructionTest : public HloVerifiedTestBase { +class HloInstructionTest : public HloTestBase { protected: Shape r0f32_ = ShapeUtil::MakeShape(F32, {}); }; @@ -151,7 +151,7 @@ TEST_F(HloInstructionTest, UserWithTwoOperands) { builder.AddInstruction(HloInstruction::CreateParameter(1, r0f32_, "bar")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(add->operands(), UnorderedElementsAre(foo, bar)); @@ -188,7 +188,7 @@ TEST_F(HloInstructionTest, MultipleUsers) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, foo->user_count()); @@ -221,7 +221,7 @@ TEST_F(HloInstructionTest, RepeatedUser) { builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "foo")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(1, foo->user_count()); @@ -256,7 +256,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperands) { HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c0, param1)); auto addtotal = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); OpAndUserCollectingVisitor visitor; @@ -305,7 +305,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperandsWithUnaryOps) { HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright)); auto neg2 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, addtotal)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); OpAndUserCollectingVisitor visitor; @@ -327,7 +327,7 @@ TEST_F(HloInstructionTest, TrivialMap) { // Shape r0f32 = ShapeUtil::MakeShape(F32, {}); Shape f32a100x10 = ShapeUtil::MakeShape(F32, {100, 10}); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); // Builds an x+1.0 computation to use in a Map. auto embedded_builder = HloComputation::Builder("f32+1"); @@ -375,7 +375,7 @@ TEST_F(HloInstructionTest, TrivialReduce) { HloInstruction::CreateParameter(1, r0f32, "y")); embedded_builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, paramx, paramy)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build()); // Builds a parameter and an initial value and feeds them to the reduce. 
@@ -416,7 +416,7 @@ TEST_F(HloInstructionTest, ReplaceUseInBinaryOps) { HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo)); builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, add_foobar, add_foofoo)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, foo->user_count()); @@ -451,7 +451,7 @@ TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) { builder.AddInstruction(HloInstruction::CreateTuple({foo, bar, baz, foo})); auto add_foobar = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, foo->user_count()); @@ -479,7 +479,7 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo)); auto log = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, foo->user_count()); @@ -516,7 +516,7 @@ TEST_F(HloInstructionTest, ReplaceAllUsesWithInBinaryOps) { HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo)); builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, add_foobar, add_foofoo)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, foo->user_count()); @@ -546,7 +546,7 @@ TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) { auto exp = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo)); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({foo, bar})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, foo->user_count()); @@ -611,7 +611,7 @@ TEST_F(HloInstructionTest, PostProcessAllVisitedNodes) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, exp, log)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); NodeCollectorAndPostProcessor visitor; @@ -629,7 +629,7 @@ TEST_F(HloInstructionTest, SingletonFusionOp) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto exp = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {exp}, HloInstruction::FusionKind::kLoop); @@ -647,7 +647,7 @@ TEST_F(HloInstructionTest, BinaryFusionOp) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.1f))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( r0f32_, HloOpcode::kAdd, constant1, constant2)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {add}, HloInstruction::FusionKind::kLoop); @@ -669,7 +669,7 @@ TEST_F(HloInstructionTest, ChainFusionOp) { auto exp3 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2)); - auto module = CreateNewModule(); 
+ auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {exp3, exp2, exp1}, HloInstruction::FusionKind::kLoop); @@ -692,7 +692,7 @@ TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) { exp1->set_metadata(metadata); exp2->set_metadata(metadata); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {exp2, exp1}, HloInstruction::FusionKind::kLoop); @@ -749,7 +749,7 @@ TEST_F(HloInstructionTest, PreserveTupleShapeThroughClone) { TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { // Create a fusion instruction containing a single unary operation. const Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto make_map_computation = [&]() { auto builder = HloComputation::Builder("FusionMap"); @@ -817,7 +817,7 @@ TEST_F(HloInstructionTest, ComplexFusionOp) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({sub, sub, mul, c1})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {tuple, sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop); @@ -977,7 +977,7 @@ TEST_F(HloInstructionTest, FunctionVisitor) { HloInstruction::CreateUnary(f32, HloOpcode::kExp, param)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32, HloOpcode::kAdd, negate, exp)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); int visit_num = 0; @@ -1006,7 +1006,7 @@ TEST_F(HloInstructionTest, FullyElementwise) { builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, x, y)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_TRUE(add->IsElementwise()); @@ -1016,7 +1016,7 @@ TEST_F(HloInstructionTest, FullyElementwise) { } TEST_F(HloInstructionTest, MapIsElementwise) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); const Shape r2f32 = ShapeUtil::MakeShapeWithLayout(F32, {10, 10}, {1, 0}); HloComputation::Builder builder(TestName()); HloComputation::Builder map_builder("id"); @@ -1067,7 +1067,7 @@ TEST_F(HloInstructionTest, PartiallyElementwise) { HloInstruction* max = builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kMaximum, div, broadcast)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {max, broadcast, div, mul}, HloInstruction::FusionKind::kLoop); @@ -1108,7 +1108,7 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) { HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary( r1f32, HloOpcode::kSubtract, min, broadcast)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {sub, broadcast, min}, HloInstruction::FusionKind::kLoop); @@ -1151,7 +1151,7 @@ 
TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) { HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot( sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {dot, reshape}, HloInstruction::FusionKind::kLoop); @@ -1192,7 +1192,7 @@ TEST_F(HloInstructionTest, NoRedundantFusionOperandsAfterReplacingUse) { HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot( s, x, reshape, dot_dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {dot, reshape}, HloInstruction::FusionKind::kLoop); @@ -1204,7 +1204,7 @@ TEST_F(HloInstructionTest, NoRedundantFusionOperandsAfterReplacingUse) { } TEST_F(HloInstructionTest, FusionEquality) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); // Create two fusion instructions containing a single unary operation. @@ -1226,7 +1226,7 @@ TEST_F(HloInstructionTest, FusionEquality) { } TEST_F(HloInstructionTest, NestedFusionEquality) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); // Build a nested fusion computation. @@ -1330,7 +1330,7 @@ TEST_F(HloInstructionTest, Stringification) { "%dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} " "%transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}"); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* loop = builder.AddInstruction( @@ -1373,7 +1373,7 @@ TEST_F(HloInstructionTest, StringifyGather_0) { /*index_vector_dim=*/4), /*slice_sizes=*/{30, 29, 28, 27, 26})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(gather_instruction->ToString(), @@ -1408,7 +1408,7 @@ TEST_F(HloInstructionTest, StringifyGather_1) { /*index_vector_dim=*/2), /*slice_sizes=*/{30, 29, 28, 27, 26})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(gather_instruction->ToString(), @@ -1443,7 +1443,7 @@ TEST_F(HloInstructionTest, StringifyScatter) { update_builder.AddInstruction( HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "p2")); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* update_computation = module->AddEmbeddedComputation(update_builder.Build()); @@ -1495,7 +1495,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationFusion) { "f32[5,20]{1,0} dot(f32[5,10]{1,0}, f32[10,20]{1,0}), " "lhs_contracting_dims={1}, rhs_contracting_dims={0}"); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {dot, reshape}, HloInstruction::FusionKind::kLoop); @@ -1531,7 +1531,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationWhile) { HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot( sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2))); - auto module 
= CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); computation->CreateFusionInstruction({dot, reshape}, HloInstruction::FusionKind::kLoop); @@ -1587,7 +1587,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationConditional) { HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot( sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); computation->CreateFusionInstruction({dot, reshape}, HloInstruction::FusionKind::kLoop); diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 88495e80000..4c765aa375c 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -2349,4 +2349,43 @@ HloInstructionProto HloDomainInstruction::ToProto() const { return proto; } + +HloGetDimensionSizeInstruction::HloGetDimensionSizeInstruction( + const Shape& shape, HloInstruction* operand, int64 dimension) + : HloInstruction(HloOpcode::kGetDimensionSize, shape), + dimension_(dimension) { + AppendOperand(operand); +} + +HloInstructionProto HloGetDimensionSizeInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.add_dimensions(dimension()); + return proto; +} + +std::vector HloGetDimensionSizeInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& /*options*/) const { + return {StrCat("dimensions={", dimension(), "}")}; +} + +bool HloGetDimensionSizeInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + /*eq_computations*/) const { + const auto& casted_other = + static_cast(other); + return dimension() == casted_other.dimension(); +} + +std::unique_ptr +HloGetDimensionSizeInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* /*context*/) const { + if (new_operands.size() != 1) { + LOG(FATAL) << "expects 1 operand"; + } + return absl::make_unique( + shape, new_operands[0], dimension()); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index bf4daf2be47..d43a8973ccf 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1385,6 +1385,33 @@ class HloDomainInstruction : public HloInstruction { std::unique_ptr operand_side_metadata_; std::unique_ptr user_side_metadata_; }; + +class HloGetDimensionSizeInstruction : public HloInstruction { + public: + explicit HloGetDimensionSizeInstruction(const Shape& shape, + HloInstruction* operand, + int64 dimension); + + // Returns the dimension sizes or numbers associated with this instruction. + int64 dimension() const { return dimension_; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + int64 dimension_; +}; + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_ diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index 1717770301e..170ec93a334 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -165,6 +165,7 @@ namespace opcode_matchers { } HLO_MATCHER(Abs); HLO_MATCHER(Add); +HLO_MATCHER(AllToAll); HLO_MATCHER(Bitcast); HLO_MATCHER(Broadcast); HLO_MATCHER(BatchNormGrad); @@ -178,6 +179,7 @@ HLO_MATCHER(Convert); HLO_MATCHER(Convolution); HLO_MATCHER(Copy); HLO_MATCHER(CrossReplicaSum); +HLO_MATCHER(CollectivePermute); HLO_MATCHER(Divide); HLO_MATCHER(Domain); HLO_MATCHER(DynamicSlice); diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc index 2f15997fc17..984a6266abb 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc @@ -65,7 +65,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) { auto sub = builder.AddInstruction( HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); HloMemoryScheduler scheduler([](const BufferValue& buffer) { @@ -172,7 +172,7 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) { builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, tuple_elm, abs_abs2)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule, ScheduleModule(*module, @@ -218,7 +218,7 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) { builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, tuple_elm, exp)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto fusion = computation->CreateFusionInstruction( @@ -242,7 +242,7 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) { } TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape r1f32 = ShapeUtil::MakeShape(F32, {4}); // param != 0 diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 6a838b7eb96..14bf17f4be1 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -559,7 +559,8 @@ std::unique_ptr HloModule::Clone(const string& suffix) const { std::unique_ptr HloModule::Clone(const HloModuleConfig& config, const string& suffix) const { VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n"; - auto module = absl::make_unique(name_ + "-" + suffix, config); + auto module = absl::make_unique( + absl::StrCat(name_, suffix.empty() ? 
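The HloGetDimensionSizeInstruction defined above is created through the HloInstruction::CreateGetDimensionSize factory declared in the hlo_instruction.h hunk earlier in this patch. A rough sketch of building one; the s32[] result shape is an assumption here, since the authoritative rule is ShapeInference::InferGetDimensionSizeShape used by the verifier hunk later on:

// Sketch only. Assumes an s32[] result shape for get-dimension-size; the
// authoritative shape comes from ShapeInference::InferGetDimensionSizeShape.
#include <memory>

#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/platform/logging.h"

namespace xla {

std::unique_ptr<HloComputation> BuildGetDimensionSizeExample() {
  HloComputation::Builder builder("get_dimension_size_example");
  const Shape operand_shape = ShapeUtil::MakeShape(F32, {10, 20});
  HloInstruction* param = builder.AddInstruction(
      HloInstruction::CreateParameter(0, operand_shape, "p0"));
  // Query the size of dimension 1 (statically 20 here) as a runtime value.
  HloInstruction* dim_size = builder.AddInstruction(
      HloInstruction::CreateGetDimensionSize(ShapeUtil::MakeShape(S32, {}),
                                             param, /*dimension=*/1));
  // ToString() includes the attribute emitted by ExtraAttributesToStringImpl,
  // e.g. "... get-dimension-size(... %p0), dimensions={1}".
  VLOG(1) << dim_size->ToString();
  return builder.Build();
}

}  // namespace xla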
"" : "-", suffix), config); HloCloneContext context(module.get(), suffix); auto cloned_computation = entry_computation_->Clone(suffix, &context); diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc index 39f38b417ab..3ae67e4e5ee 100644 --- a/tensorflow/compiler/xla/service/hlo_module_test.cc +++ b/tensorflow/compiler/xla/service/hlo_module_test.cc @@ -63,7 +63,7 @@ class HloModuleTest : public HloTestBase { TEST_F(HloModuleTest, OneComputationPostOrder) { // Create a module with a single computation. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(CreateConstantComputation()); EXPECT_THAT(module->MakeComputationPostOrder(), @@ -72,7 +72,7 @@ TEST_F(HloModuleTest, OneComputationPostOrder) { TEST_F(HloModuleTest, TwoComputationsPostOrder) { // Create a module with two unconnected computations. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation1 = module->AddEntryComputation(CreateConstantComputation()); auto computation2 = module->AddEmbeddedComputation(CreateConstantComputation()); @@ -88,7 +88,7 @@ TEST_F(HloModuleTest, TwoComputationsPostOrder) { TEST_F(HloModuleTest, CloneTest) { // Create and copy a module with a diamond call graph of computations. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation1 = module->AddEmbeddedComputation(CreateConstantComputation()); auto computation2 = @@ -111,7 +111,7 @@ TEST_F(HloModuleTest, CloneTest) { } TEST_F(HloModuleTest, CloneHasFusion) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); // Create the fused computation. HloComputation* fused_computation; @@ -154,7 +154,7 @@ TEST_F(HloModuleTest, CloneHasFusion) { TEST_F(HloModuleTest, DiamondComputationsPostOrder) { // Create a module with a diamond call graph of computations. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation1 = module->AddEmbeddedComputation(CreateConstantComputation()); auto computation2 = @@ -174,7 +174,7 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) { TEST_F(HloModuleTest, LargeConstantToString) { // Create a module with a single computation. 
- auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder("Constant"); std::vector values(16, 42.0); builder.AddInstruction( @@ -194,8 +194,8 @@ TEST_F(HloModuleTest, LargeConstantToString) { } TEST_F(HloModuleTest, UniqueModuleId) { - auto module_a = CreateNewModule(); - auto module_b = CreateNewModule(); + auto module_a = CreateNewUnverifiedModule(); + auto module_b = CreateNewUnverifiedModule(); EXPECT_NE(module_a->unique_id(), module_b->unique_id()); } diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index e6bfb8025d4..70c7d70b41c 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -83,6 +83,7 @@ namespace xla { V(kFusion, "fusion", kHloOpcodeIsVariadic) \ V(kGather, "gather") \ V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \ + V(kGetDimensionSize, "get-dimension-size") \ V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ V(kGetTupleElement, "get-tuple-element") \ V(kGt, "greater-than", kHloOpcodeIsComparison) \ diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index 23d41d91d69..f5f99bece18 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -334,7 +334,7 @@ DependencyHloOrdering::DependencyHloOrdering(const HloModule* module) // ordering based on dependencies. ExecutesBefore will return true iff there // exists a path in the HLO computation graph from 'a' to 'b'. for (auto* computation : module->MakeNonfusionComputations()) { - predecessors_.emplace(computation, computation->ComputeReachability()); + predecessors_.emplace(computation, HloReachabilityMap::Build(computation)); } } @@ -374,11 +374,10 @@ bool SequentialHloOrdering::ExecutesBeforeInSameComputation( return order_position_.at(a) < order_position_.at(b); } -const std::vector* -SequentialHloOrdering::SequentialOrder( +const HloInstructionSequence* SequentialHloOrdering::SequentialOrder( const HloComputation& computation) const { return schedule_.is_computation_scheduled(&computation) - ? &schedule_.sequence(&computation).instructions() + ? &schedule_.sequence(&computation) : nullptr; } diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h index 66313492eb2..a07214c22c0 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.h +++ b/tensorflow/compiler/xla/service/hlo_ordering.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/service/hlo_schedule.h" #include "tensorflow/compiler/xla/service/hlo_value.h" #include "tensorflow/compiler/xla/types.h" @@ -64,7 +65,7 @@ class HloOrdering { // Returns the sequential instruction order for the given computation, or // nullptr if the computation does not have a sequential ordering. - virtual const std::vector* SequentialOrder( + virtual const HloInstructionSequence* SequentialOrder( const HloComputation& computation) const = 0; // Return the call graph of the module used to compute ordering. @@ -96,7 +97,7 @@ class PredecessorHloOrdering : public HloOrdering { // Returns nullptr indicating the computation does not have a sequential // ordering. 
- const std::vector* SequentialOrder( + const HloInstructionSequence* SequentialOrder( const HloComputation& computation) const override { return nullptr; } @@ -185,7 +186,7 @@ class SequentialHloOrdering : public HloOrdering { ~SequentialHloOrdering() override = default; // Returns the sequential instruction order for the given computation. - const std::vector* SequentialOrder( + const HloInstructionSequence* SequentialOrder( const HloComputation& computation) const override; string ToString() const override; diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc index b045adc9640..2ab8aa57f6e 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc @@ -53,7 +53,7 @@ TEST_F(HloOrderingTest, InstructionsInDifferentComputations) { // %c = Constant(42.0f) // // This results in a diamond-shaped callgraph. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto builder_c = HloComputation::Builder("C"); @@ -126,7 +126,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) { // %constant = Constant(1.0) // return While(%constant, body, condition) // - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto body_builder = HloComputation::Builder("body"); @@ -176,7 +176,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) { TEST_F(HloOrderingTest, ParametersDefinedBeforeOthers) { // Entry parameter should always be defined before other instruction. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( @@ -209,7 +209,7 @@ TEST_F(HloOrderingTest, ValuesInWhileComputations) { // %while = While(%constant, body, condition) // %add = Add(%constant, %while) // - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto body_builder = HloComputation::Builder("body"); @@ -407,7 +407,7 @@ TEST_F(HloOrderingTest, // %dead = Constant(123.0) // // %root should interfere with %dead. - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto builder = HloComputation::Builder(TestName()); @@ -455,7 +455,7 @@ TEST_F(HloOrderingTest, // ROOT %call = call({%c}), subcomputation // // %root should interfere with %dead. 
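With the hlo_ordering.h changes above, SequentialOrder now returns the HloInstructionSequence itself instead of the raw instruction vector, so callers reach the vector through instructions(). A short sketch of the adapted call pattern:

// Sketch: consuming the HloInstructionSequence now returned by
// SequentialOrder() instead of a raw vector of HloInstruction pointers.
#include "tensorflow/compiler/xla/service/hlo_ordering.h"
#include "tensorflow/core/platform/logging.h"

namespace xla {

void DumpSequentialOrder(const SequentialHloOrdering& ordering,
                         const HloComputation& computation) {
  const HloInstructionSequence* sequence =
      ordering.SequentialOrder(computation);
  if (sequence == nullptr) {
    return;  // The computation has no sequential schedule.
  }
  // The underlying vector now lives behind instructions().
  for (HloInstruction* instruction : sequence->instructions()) {
    VLOG(2) << instruction->name();
  }
}

}  // namespace xla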
- auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto subbuilder = HloComputation::Builder(TestName() + ".sub"); diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index 450660b94b7..4390145c6bd 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -108,7 +108,7 @@ class HloParser { bool ParseInstructionList(HloComputation** computation, const string& computation_name); bool ParseInstruction(HloComputation::Builder* builder, string* root_name); - bool ParseInstruciontRhs(HloComputation::Builder* builder, const string& name, + bool ParseInstructionRhs(HloComputation::Builder* builder, const string& name, LocTy name_loc); bool ParseControlPredecessors(HloInstruction* instruction); bool ParseLiteral(Literal* literal, const Shape& shape); @@ -608,10 +608,10 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, *root_name = name; } - return ParseInstruciontRhs(builder, name, name_loc); + return ParseInstructionRhs(builder, name, name_loc); } -bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder, +bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, const string& name, LocTy name_loc) { Shape shape; HloOpcode opcode; @@ -1547,6 +1547,18 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder, case HloOpcode::kTrace: return TokenError(StrCat("parsing not yet implemented for op: ", HloOpcodeString(opcode))); + case HloOpcode::kGetDimensionSize: + optional> dimensions; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = + builder->AddInstruction(HloInstruction::CreateGetDimensionSize( + shape, operands[0], (*dimensions)[0])); + break; } instruction->SetAndSanitizeName(name); @@ -2708,7 +2720,7 @@ bool HloParser::ParseConvolutionDimensionNumbers( // The str is expected to have 3 items, lhs, rhs, out, and it must look like // lhs_rhs->out, that is, the first separator is "_" and the second is "->". - std::vector split1 = absl::StrSplit(str, "_"); + std::vector split1 = absl::StrSplit(str, '_'); if (split1.size() != 2) { LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees " << str; @@ -3389,7 +3401,7 @@ bool HloParser::ParseSingleInstruction(HloModule* module) { // e.g. // // f32[10] fusion(...), calls={...} - if (!ParseInstruciontRhs(&builder, module->name(), lexer_.GetLoc())) { + if (!ParseInstructionRhs(&builder, module->name(), lexer_.GetLoc())) { return false; } } else { diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h index 81eeb9f13bf..d830fa61438 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.h +++ b/tensorflow/compiler/xla/service/hlo_parser.h @@ -44,7 +44,9 @@ Status ParseHloString(absl::string_view str, HloModule* module); // creates a HloModule with default config. StatusOr> ParseHloString(absl::string_view str); -// Parses the result of HloSharding::ToString(), e.g. "{replicated}". +// ParseHloString sharding from str. str is supposed to contain the body of the +// sharding, i.e. just the rhs of the "sharding={...}" attribute string, +// e.g., "{replicated}". StatusOr ParseSharding(absl::string_view str); // Parses the result of window_util::ToString(const Window&). 
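The kGetDimensionSize case added to ParseInstructionRhs above accepts a braced dimensions={...} attribute, matching the "get-dimension-size" opcode string registered in hlo_opcode.h. A sketch of feeding such text through ParseHloString; the s32[] result shape is an assumption, as the parser itself does not infer shapes:

// Sketch only: text accepted by the new kGetDimensionSize parser case.
// The s32[] result shape is an assumption; shape checking is left to the
// verifier.
#include <memory>

#include "tensorflow/compiler/xla/service/hlo_parser.h"

namespace xla {

StatusOr<std::unique_ptr<HloModule>> ParseGetDimensionSizeExample() {
  constexpr char kHloText[] = R"(
HloModule GetDimensionSizeExample

ENTRY main {
  p0 = f32[10,20]{1,0} parameter(0)
  ROOT size = s32[] get-dimension-size(p0), dimensions={1}
}
)";
  return ParseHloString(kHloText);
}

}  // namespace xla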
@@ -55,10 +57,6 @@ StatusOr ParseWindow(absl::string_view str); StatusOr ParseConvolutionDimensionNumbers( absl::string_view str); -// ParseHloString sharding from str. str is supposed to contain the body of the -// sharding, i.e. just the rhs of the "sharding={...}" attribute string. -StatusOr ParseSharding(absl::string_view str); - // Parses the result of PaddingConfigToString(), e.g. "0_0x1_1". StatusOr ParsePaddingConfig(absl::string_view str); diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index eae6d19792f..c59bdc0a0b3 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -1150,6 +1150,25 @@ ENTRY CrossReplicaSumWithSubgroups { ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_groups={{0,1},{2,3}}, barrier="abc", to_apply=add } +)" +}, +// cross-replica-sum with all-reduce-id +{ +"CrossReplicaSumAllReduce", +R"(HloModule CRS + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY CRS { + input = f32[8]{0} parameter(0) + crs.1 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add + ROOT crs.0 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add +} + )" }, // all-to-all diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc index ee8cb12b231..20384b9da6b 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc @@ -19,14 +19,14 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { -class HloPassPipelineTest : public HloVerifiedTestBase { +class HloPassPipelineTest : public HloTestBase { protected: StatusOr ParseModuleGroup( absl::Span hlo_strings) { diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc index 2d5197be9e6..f968a4a9445 100644 --- a/tensorflow/compiler/xla/service/hlo_query.cc +++ b/tensorflow/compiler/xla/service/hlo_query.cc @@ -104,5 +104,20 @@ bool IsScalarConstant(const HloInstruction* instruction) { return instruction->IsConstant() && ShapeUtil::IsScalar(instruction->shape()); } +bool ContainsInstrWithOpcode(const HloComputation* comp, + const absl::flat_hash_set& opcodes) { + for (const auto* instr : comp->instructions()) { + if (opcodes.count(instr->opcode())) { + return true; + } + for (const HloComputation* subcomp : instr->called_computations()) { + if (ContainsInstrWithOpcode(subcomp, opcodes)) { + return true; + } + } + } + return false; +} + } // namespace hlo_query } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h index c0826a6aee1..215051f8834 100644 --- a/tensorflow/compiler/xla/service/hlo_query.h +++ b/tensorflow/compiler/xla/service/hlo_query.h @@ -16,6 +16,8 @@ limitations under the License. 
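hlo_query::ContainsInstrWithOpcode, added above (its declaration follows in the hlo_query.h hunk), recurses into called computations, so it also sees instructions nested in while bodies, fusions, and conditionals. A small usage sketch:

// Sketch: using the new hlo_query::ContainsInstrWithOpcode helper.
#include "absl/container/flat_hash_set.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/hlo_query.h"

namespace xla {

bool UsesLoopsOrCollectives(const HloComputation* entry) {
  // Also matches instructions nested in while bodies, fusions, conditionals,
  // etc., because the helper recurses into called computations.
  return hlo_query::ContainsInstrWithOpcode(
      entry, {HloOpcode::kWhile, HloOpcode::kCrossReplicaSum,
              HloOpcode::kAllToAll});
}

}  // namespace xla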
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_QUERY_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_QUERY_H_ +#include "absl/container/flat_hash_set.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" namespace xla { @@ -41,6 +43,12 @@ bool AllOperandsAreConstants(const HloInstruction& instruction); // Returns whether the instruction is a scalar constant. bool IsScalarConstant(const HloInstruction* instruction); +// Determines whether the given computation contains an instruction with one of +// the given opcodes. Checks both comp's instructions and the instructions of +// any computations nested within it. +bool ContainsInstrWithOpcode(const HloComputation* comp, + const absl::flat_hash_set& opcodes); + // Returns an operand of an instruction with the given opcode. If there are // multiple matching operands, then the first matching operand is returned. If // there are no matching operands then nullptr is returned. diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc index 961930f0a88..4aa80677524 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/xla/service/hlo_reachability.h" namespace xla { @@ -22,7 +24,7 @@ HloReachabilityMap::HloReachabilityMap( : size_(instructions.size()) { bit_vectors_.reserve(size_); for (const HloInstruction* hlo : instructions) { - indices_[hlo] = bit_vectors_.size(); + indices_[GetKey(hlo)] = bit_vectors_.size(); bit_vectors_.emplace_back(size_); } CHECK_EQ(size_, indices_.size()); // instructions should be unique @@ -71,4 +73,70 @@ bool HloReachabilityMap::IsConnected(const HloInstruction* a, return IsReachable(a, b) || IsReachable(b, a); } +std::unique_ptr HloReachabilityMap::Build( + const HloComputation* computation) { + const auto& all = computation->MakeInstructionPostOrder(); + auto result = absl::make_unique(all); + auto channel_dependency_map = computation->ComputeChannelDependencies(); + + std::vector inputs; + for (const HloInstruction* hlo : all) { + inputs.assign(hlo->operands().begin(), hlo->operands().end()); + inputs.insert(inputs.end(), hlo->control_predecessors().begin(), + hlo->control_predecessors().end()); + + switch (hlo->opcode()) { + case HloOpcode::kRecvDone: { + auto it = channel_dependency_map.find(hlo->channel_id()); + if (it != channel_dependency_map.end()) { + absl::c_copy(it->second, std::back_inserter(inputs)); + } + break; + } + case HloOpcode::kCrossReplicaSum: { + auto all_reduce_id = hlo->all_reduce_id(); + if (all_reduce_id) { + auto it = channel_dependency_map.find(all_reduce_id.value()); + if (it != channel_dependency_map.end()) { + absl::c_copy(it->second, std::back_inserter(inputs)); + } + } + break; + } + default: + break; + } + + result->FastSetReachabilityToUnion(inputs, hlo); + } + return result; +} + +void HloReachabilityMap::UpdateReachabilityThroughInstruction( + const HloInstruction* instruction) { + std::queue worklist; + worklist.push(instruction); + + std::vector inputs; + + while (!worklist.empty()) { + const HloInstruction* item = worklist.front(); + worklist.pop(); + + inputs.assign(item->operands().begin(), item->operands().end()); + inputs.insert(inputs.end(), 
item->control_predecessors().begin(), + item->control_predecessors().end()); + + if (SetReachabilityToUnion(inputs, item)) { + // Add immediate successors to worklist. + for (const HloInstruction* user : item->users()) { + worklist.push(user); + } + for (const HloInstruction* succ : item->control_successors()) { + worklist.push(succ); + } + } + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h index 3e27d098aeb..7823b06a41b 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.h +++ b/tensorflow/compiler/xla/service/hlo_reachability.h @@ -16,27 +16,30 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_ +#include #include #include +#include "absl/base/casts.h" #include "absl/container/flat_hash_map.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" namespace xla { -class HloInstruction; - // A class for representing reachability between HloInstructions. // -// !!! THIS CLASS DOES NOT COMPUTE REACHABILITY !!! It has an adjacency matrix -// and it is up to the user of the class to set the adjacency matrix such that -// it represents reachability, i.e. such that it is transitive. That the graph -// be transitive is thus not an invariant of this class, but it is required for -// the name of the class and its methods to make sense. +// It has an adjacency matrix and it is up to the user of the class to set the +// adjacency matrix such that it represents reachability, i.e. such that it is +// transitive. That the graph be transitive is thus not an invariant of this +// class, but it is required for the name of the class and its methods to make +// sense. class HloReachabilityMap { public: // Sets up a graph with no edges and where the nodes correspond to the given @@ -44,6 +47,15 @@ class HloReachabilityMap { explicit HloReachabilityMap( absl::Span instructions); + // Computes and returns the reachability between HLO instructions in the + // computation. The returned HloReachabilityMap is constructed such that + // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a + // directed path (from producer to consumer) from 'a' to 'b'. Both data + // dependencies (operands) and control dependencies are considered for + // reachability. Trivially an instruction is reachable from itself. + static std::unique_ptr Build( + const HloComputation* computation); + // Set the reachability set of 'instruction' to the union of the reachability // sets of 'inputs'. Upon return, IsReachable(x, instruction) where // 'x' is not 'instruction' will return true iff IsReachable(x, input) is true @@ -70,6 +82,10 @@ class HloReachabilityMap { // adjacency matrix. void SetReachable(const HloInstruction* a, const HloInstruction* b); + // Updates the given reachability map after the immediate predecessor set + // (operands and control predecessors) of 'instruction' has changed. 
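HloReachabilityMap::Build and UpdateReachabilityThroughInstruction, documented above, are exercised thoroughly by the new reachability tests below; as a compact sketch of the intended call pattern:

// Sketch: building a reachability map and refreshing it after a graph edit.
#include "tensorflow/compiler/xla/service/hlo_reachability.h"
#include "tensorflow/core/platform/logging.h"

namespace xla {

void ReachabilityExample(const HloComputation* computation,
                         const HloInstruction* producer,
                         HloInstruction* consumer) {
  auto reachability = HloReachabilityMap::Build(computation);
  if (reachability->IsReachable(producer, consumer)) {
    // There is a data or control path from producer to consumer.
    VLOG(1) << producer->name() << " reaches " << consumer->name();
  }
  // After consumer's operands or control predecessors change, propagate the
  // update through its transitive users instead of rebuilding from scratch.
  reachability->UpdateReachabilityThroughInstruction(consumer);
}

}  // namespace xla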
+ void UpdateReachabilityThroughInstruction(const HloInstruction* instruction); + // Returns true if "b" is reachable from "a" // // Note that this function only correctly answers queries about reachability @@ -83,7 +99,9 @@ class HloReachabilityMap { bool IsConnected(const HloInstruction* a, const HloInstruction* b) const; // Checks if an instruction is in the Reachability map. - bool IsPresent(const HloInstruction* a) const { return indices_.contains(a); } + bool IsPresent(const HloInstruction* a) const { + return indices_.contains(GetKey(a)); + } private: // A bit-vector implementation specialized for this use case which provides a @@ -146,18 +164,24 @@ class HloReachabilityMap { absl::Span inputs, const HloInstruction* instruction, BitVector* bit_vector); + uint64 GetKey(const HloInstruction* instruction) const { + uint64 unique_id = absl::bit_cast(instruction->unique_id()); + uint64 module_id = + absl::bit_cast(instruction->parent()->parent()->unique_id()); + return (module_id << 32) | unique_id; + } // Return the index of the given instruction. The value is used to index into // the vector of BitVectors and the BitVectors themselves. int GetIndex(const HloInstruction* instruction) const { - return FindOrDie(indices_, instruction); + return FindOrDie(indices_, GetKey(instruction)); } // The number of instructions in the reachability map. const size_t size_; - // Dense assignment from HloInstruction* to number. These numbers index - // into the bit_vectors_ vector and into the bits within a BitVector. - absl::flat_hash_map indices_; + // Dense assignment from HloInstruction::unique_id to number. These numbers + // index into the bit_vectors_ vector and into the bits within a BitVector. + absl::flat_hash_map indices_; // Bitvectors holding the reachability to each instruction. The bit vector for // instruction X includes ones for each instruction which X is reachable from. diff --git a/tensorflow/compiler/xla/service/hlo_reachability_test.cc b/tensorflow/compiler/xla/service/hlo_reachability_test.cc index d9848cee0bf..59517670980 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability_test.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability_test.cc @@ -20,13 +20,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" namespace xla { namespace { -class HloReachabilityTest : public HloVerifiedTestBase {}; +class HloReachabilityTest : public HloTestBase {}; TEST_F(HloReachabilityTest, Reachability) { // Construct and test a reachability graph of the following form: @@ -48,7 +48,8 @@ TEST_F(HloReachabilityTest, Reachability) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto e = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); - builder.Build(); + auto module = CreateNewVerifiedModule(); + module->AddEntryComputation(builder.Build()); HloReachabilityMap reachability({a, b, c, d, e}); reachability.SetReachable(a, a); @@ -81,6 +82,130 @@ TEST_F(HloReachabilityTest, Reachability) { EXPECT_FALSE(reachability.SetReachabilityToUnion({b, c}, d)); } +TEST_F(HloReachabilityTest, NonTrivialReachability) { + // Test reachability of a non-trivial computation: + // + // const1 const2 + // | | + // | +-------+ + // | | | + // add .. negate + // | . | + // | .... 
exp + // | | + // +---+ +-+---+ + // | | | + // multiply copy + // + // There is a control dependency from 'add' to 'exp'. + Shape r0f32 = ShapeUtil::MakeShape(F32, {}); + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0f))); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + r0f32, HloOpcode::kAdd, constant1, constant2)); + auto negate = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kNegate, constant2)); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, negate)); + auto mul = builder.AddInstruction( + HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, add, exp)); + auto copy = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kCopy, exp)); + + auto module = CreateNewVerifiedModule(); + auto computation = + module->AddEntryComputation(builder.Build(/*root_instruction=*/mul)); + + TF_CHECK_OK(add->AddControlDependencyTo(exp)); + auto reachability = HloReachabilityMap::Build(computation); + + EXPECT_TRUE(reachability->IsReachable(constant1, constant1)); + EXPECT_FALSE(reachability->IsReachable(constant1, constant2)); + EXPECT_TRUE(reachability->IsReachable(constant1, add)); + EXPECT_FALSE(reachability->IsReachable(constant1, negate)); + EXPECT_TRUE(reachability->IsReachable(constant1, exp)); + EXPECT_TRUE(reachability->IsReachable(constant1, mul)); + EXPECT_TRUE(reachability->IsReachable(constant1, copy)); + + EXPECT_FALSE(reachability->IsReachable(constant2, constant1)); + EXPECT_TRUE(reachability->IsReachable(constant2, constant2)); + EXPECT_TRUE(reachability->IsReachable(constant2, add)); + EXPECT_TRUE(reachability->IsReachable(constant2, negate)); + EXPECT_TRUE(reachability->IsReachable(constant2, exp)); + EXPECT_TRUE(reachability->IsReachable(constant2, mul)); + EXPECT_TRUE(reachability->IsReachable(constant2, copy)); + + EXPECT_FALSE(reachability->IsReachable(exp, constant1)); + EXPECT_FALSE(reachability->IsReachable(exp, constant2)); + EXPECT_FALSE(reachability->IsReachable(exp, add)); + EXPECT_FALSE(reachability->IsReachable(exp, negate)); + EXPECT_TRUE(reachability->IsReachable(exp, exp)); + EXPECT_TRUE(reachability->IsReachable(exp, mul)); + EXPECT_TRUE(reachability->IsReachable(exp, copy)); + + EXPECT_FALSE(reachability->IsReachable(mul, constant1)); + EXPECT_FALSE(reachability->IsReachable(mul, constant2)); + EXPECT_FALSE(reachability->IsReachable(mul, add)); + EXPECT_FALSE(reachability->IsReachable(mul, negate)); + EXPECT_FALSE(reachability->IsReachable(mul, exp)); + EXPECT_TRUE(reachability->IsReachable(mul, mul)); + EXPECT_FALSE(reachability->IsReachable(mul, copy)); + + EXPECT_TRUE(reachability->IsConnected(constant1, copy)); + EXPECT_TRUE(reachability->IsConnected(copy, constant1)); + EXPECT_FALSE(reachability->IsConnected(negate, add)); + EXPECT_FALSE(reachability->IsConnected(add, negate)); + + // Remove the control dependency then update and verify the reachability map + ASSERT_IS_OK(add->RemoveControlDependencyTo(exp)); + reachability->UpdateReachabilityThroughInstruction(exp); + + EXPECT_TRUE(reachability->IsReachable(constant1, constant1)); + EXPECT_FALSE(reachability->IsReachable(constant1, constant2)); + EXPECT_TRUE(reachability->IsReachable(constant1, add)); + EXPECT_FALSE(reachability->IsReachable(constant1, negate)); + 
EXPECT_FALSE(reachability->IsReachable(constant1, exp)); + EXPECT_TRUE(reachability->IsReachable(constant1, mul)); + EXPECT_FALSE(reachability->IsReachable(constant1, copy)); + + // Change a use within the graph then update and verify the reachability map + ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1)); + reachability->UpdateReachabilityThroughInstruction(negate); + + EXPECT_FALSE(reachability->IsReachable(constant2, constant1)); + EXPECT_TRUE(reachability->IsReachable(constant2, constant2)); + EXPECT_TRUE(reachability->IsReachable(constant2, add)); + EXPECT_FALSE(reachability->IsReachable(constant2, negate)); + EXPECT_FALSE(reachability->IsReachable(constant2, exp)); + EXPECT_TRUE(reachability->IsReachable(constant2, mul)); + EXPECT_FALSE(reachability->IsReachable(constant2, copy)); +} + +TEST_F(HloReachabilityTest, ChannelReachability) { + const Shape shape = ShapeUtil::MakeShape(F32, {5, 7}); + HloComputation::Builder builder("ChannelReachability"); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto token0 = builder.AddInstruction(HloInstruction::CreateToken()); + auto send = + builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1)); + auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); + auto token1 = builder.AddInstruction(HloInstruction::CreateToken()); + auto recv = + builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1)); + auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv)); + + auto module = CreateNewVerifiedModule(); + auto computation = module->AddEntryComputation(builder.Build(recv_done)); + auto reachability = HloReachabilityMap::Build(computation); + EXPECT_TRUE(reachability->IsReachable(param, recv_done)); + EXPECT_FALSE(reachability->IsReachable(send, recv)); + EXPECT_FALSE(reachability->IsReachable(send_done, recv)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc index f7e82fb1f88..22c3c40a93a 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc @@ -24,7 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -36,7 +36,7 @@ namespace op = xla::testing::opcode_matchers; using ::testing::_; -class HloRematerializationTest : public HloVerifiedTestBase { +class HloRematerializationTest : public HloTestBase { protected: // Creates and returns a computation which can benefit from // rematerialization. The computation looks like: @@ -162,7 +162,7 @@ class HloRematerializationTest : public HloVerifiedTestBase { // Test rematerialization of a single computation produced by // MakeRematerializableComputation. 
TEST_F(HloRematerializationTest, SingleComputation) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(MakeRematerializableComputation()); @@ -177,7 +177,7 @@ TEST_F(HloRematerializationTest, SingleComputation) { // with rematerialization so pick a memory limit between these values (14KB). TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( - /*memory_limit_bytes=*/14 * 1024, module)); + /*memory_limit_bytes=*/14 * 1024, module.get())); EXPECT_TRUE(changed); // Root should not have changed. @@ -203,7 +203,7 @@ TEST_F(HloRematerializationTest, SingleComputation) { // MakeRematerializableComputation but with a sufficiently high memory limit // such that no instructions are rematerialized. TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(MakeRematerializableComputation()); @@ -211,7 +211,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) { TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( - /*memory_limit_bytes=*/20 * 1024, module)); + /*memory_limit_bytes=*/20 * 1024, module.get())); // No instructions should have been materialized. EXPECT_FALSE(changed); @@ -225,7 +225,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) { // computation should be the one chosen because rematerialization in the while // will presumably be more expensive. TEST_F(HloRematerializationTest, RematerializeAroundWhile) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto cond_builder = HloComputation::Builder(TestName() + ".cond"); cond_builder.AddInstruction( @@ -249,7 +249,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) { // bit lower (17KB) to force rematerialization of the entry computation. TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( - /*memory_limit_bytes=*/17 * 1024, module)); + /*memory_limit_bytes=*/17 * 1024, module.get())); EXPECT_TRUE(changed); // Only the entry computation should have a rematerialized instruction added. @@ -261,7 +261,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) { // while. Both the entry computation and while body computation should have // computations rematerialized. TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto cond_builder = HloComputation::Builder(TestName() + ".cond"); cond_builder.AddInstruction( @@ -282,7 +282,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) { TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( - /*memory_limit_bytes=*/15 * 1024, module)); + /*memory_limit_bytes=*/15 * 1024, module.get())); EXPECT_TRUE(changed); // Both computations should have rematerialized instructions added. @@ -293,7 +293,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) { // Test rematerialization of a doubly nested computation. All computations // should have an instruction rematerialized. 
TEST_F(HloRematerializationTest, RematerializeNestedComputations) { - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto cond_builder = HloComputation::Builder(TestName() + ".cond"); cond_builder.AddInstruction( @@ -321,7 +321,7 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) { // ~12K so pick something slightly larger. TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( - /*memory_limit_bytes=*/13 * 1024, module)); + /*memory_limit_bytes=*/13 * 1024, module.get())); EXPECT_TRUE(changed); // All computations should have rematerialized instructions added. @@ -346,7 +346,7 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) { // // F32[1024] add_2 = add(rng, add(tanh, add_1)) // LIVE: add_2 + add_1 + // // rng + tanh + exp - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param = builder.AddInstruction( @@ -390,7 +390,7 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) { TF_ASSERT_OK_AND_ASSIGN( bool changed, RunHloRematerialization( - /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_), module)); + /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_), module.get())); EXPECT_TRUE(changed); // The rng should not have been rematerialized. EXPECT_EQ(count_rngs(entry_computation), 1); @@ -420,7 +420,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) { // The value %bcast is live across each call of Subcomputation (which requires // 8KB) though the value is not used in the calls. Rematerializing %bcast // across these calls reduces peak memory use from ~20KB down to ~16KB. - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* subcomputation = nullptr; { @@ -482,7 +482,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) { // rematerialization). TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( - /*memory_limit_bytes=*/22 * 1024, module)); + /*memory_limit_bytes=*/22 * 1024, module.get())); EXPECT_TRUE(changed); // The broadcast should have been rematerialized 3 times. @@ -533,7 +533,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) { // (ie %bcast is used indirectly by %negate), otherwise the %negate operand // aliases %add_2. const bool indirectly_used = GetParam(); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloComputation* subcomputation = nullptr; { @@ -576,7 +576,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) { // rematerialization). TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( - /*memory_limit_bytes=*/22 * 1024, module)); + /*memory_limit_bytes=*/22 * 1024, module.get())); // Rematerialization should only occur if the rematerializable instruction has // no indirect uses. 
if (indirectly_used) { diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc index 45c684d6675..11994d99c93 100644 --- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc +++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc @@ -66,7 +66,7 @@ class HloSubcomputationUnificationTest : public HloTestBase { }; TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto callee1 = @@ -103,7 +103,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) { } TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto callee1 = @@ -143,7 +143,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) { // Do not unify subcomputations with different parameter shapes. TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto callee1 = @@ -184,7 +184,7 @@ TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) { // Regression test for b/31466798. Checks that entry_computation is still valid // after unification. TEST_F(HloSubcomputationUnificationTest, TwoIdenticalComputations) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); for (int i = 0; i < 2; ++i) { HloComputation::Builder builder("pow"); auto x = diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc index 6fd734a2b9e..1e2b31a1f2b 100644 --- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc +++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -24,7 +24,7 @@ namespace { using ::tensorflow::GraphDef; -class HloTfGraphBuilderTest : public HloVerifiedTestBase { +class HloTfGraphBuilderTest : public HloTestBase { protected: HloTfGraphBuilderTest() {} HloTfGraphBuilder generator_; diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 136824a3356..27fd685a69a 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" @@ -755,6 +756,12 @@ Status ShapeVerifier::HandleAfterAll(HloInstruction* token) { return CheckShape(token, ShapeInference::InferAfterAllShape(operand_shapes)); } +Status ShapeVerifier::HandleGetDimensionSize(HloInstruction* get_size) { + return CheckShape( + get_size, ShapeInference::InferGetDimensionSizeShape( + get_size->operand(0)->shape(), get_size->dimensions(0))); +} + Status ShapeVerifier::CheckShape(const HloInstruction* instruction, const Shape& inferred_shape) { // If allow_mixed_precision_ is false, check if there are operands with @@ -1331,6 +1338,15 @@ class InstructionVerifier : public DfsHloVisitorWithDefault { return Status::OK(); } + Status HandleCrossReplicaSum(HloInstruction* crs) override { + if (crs->all_reduce_id().has_value()) { + TF_RET_CHECK(crs->all_reduce_id().value() > 0) + << "All reduce id must be greater than 0 for " + << crs->ToShortString(); + } + return Status::OK(); + } + Status Preprocess(HloInstruction* instruction) override { auto previous = instructions_by_name_.find(instruction->name()); TF_RET_CHECK(previous == instructions_by_name_.end()) diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 83b6244d1be..9fbfd6a21c1 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -94,6 +94,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleGather(HloInstruction* gather) override; Status HandleScatter(HloInstruction* scatter) override; Status HandleAfterAll(HloInstruction* token) override; + Status HandleGetDimensionSize(HloInstruction* get_size) override; Status FinishVisit(HloInstruction*) override { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index afe01e5487c..5ddfe0a944f 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -35,7 +35,7 @@ namespace { using ::testing::HasSubstr; -// This class cannot be converted to use HloVerifiedTestBase. It explicitly +// This class cannot be converted to use HloTestBase. It explicitly // uses HloTestBase to create and test malformed HLOs. 
class HloVerifierTest : public HloTestBase { public: @@ -66,7 +66,7 @@ TEST_F(HloVerifierTest, NullInstructionParent) { HloInstruction::CreateParameter(0, scalar_shape, "param")); HloInstruction* negate = builder.AddInstruction( HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); TF_ASSERT_OK(verifier().Run(module.get()).status()); @@ -85,7 +85,7 @@ TEST_F(HloVerifierTest, NullComputationParent) { HloInstruction::CreateParameter(0, scalar_shape, "param")); builder.AddInstruction( HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); TF_ASSERT_OK(verifier().Run(module.get()).status()); @@ -104,7 +104,7 @@ TEST_F(HloVerifierTest, DifferentOperandParents) { HloInstruction::CreateParameter(0, scalar_shape, "param")); HloInstruction* negate = builder.AddInstruction( HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); HloComputation::Builder emb_builder(TestName()); @@ -138,7 +138,7 @@ TEST_F(HloVerifierTest, ResetsShapeVerifierState) { builder.AddInstruction( HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); // Run the verifier twice. It should fail both times, because it shouldn't @@ -303,7 +303,7 @@ TEST_F(HloVerifierTest, NegativeInteriorPaddingNotAllowed) { HloInstruction::CreateConstant(LiteralUtil::Zero(F32))), padding_config)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto status = verifier().Run(module.get()).status(); @@ -327,7 +327,7 @@ TEST_F(HloVerifierTest, PadNegativeInteriorDilationNotAllowed) { HloInstruction::CreateConstant(LiteralUtil::Zero(F32).Clone())), padding_config)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(verifier().Run(module.get()).status().error_message(), diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index e103222b55f..90904ac0011 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -90,20 +90,29 @@ string HumanReadableProfileBuilder::ToString() const { op.optimal_seconds < 0 ? "" : StrFormat("(%12.1f optimal)", op.optimal_seconds * 1e6), - op.flop_count <= 0 ? "" : HumanReadableNumFlops(op.flop_count, nsecs), - op.transcendental_count <= 0 - ? "" - : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs), + op.flop_count > 0 && nsecs > 0 + ? HumanReadableNumFlops(op.flop_count, nsecs) + : "", + op.transcendental_count > 0 && nsecs > 0 + ? 
HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs) + : "", bytes_per_sec, bytes_per_cycle, op.name); }; - float optimal_seconds_sum = 0.0; + double optimal_seconds_sum = 0; int64 total_flops = 0.; int64 total_transcendentals = 0.; int64 total_bytes = 0; for (const auto& op : op_infos_) { if (op.optimal_seconds > 0) { - optimal_seconds_sum += op.optimal_seconds; + // An op can run faster than the estimated optimum. For example, we might + // estimate a fusion's speed by looking at the size of its operands and + // result, but perhaps the fusion doesn't read the entirety of all of its + // inputs. For the purposes of summing the instructions' optimal speeds, + // we treat the "optimum" as the smallest of either the estimated optimum + // and the actual speed. + optimal_seconds_sum += + std::min(double{op.optimal_seconds}, CyclesToSeconds(op.cycles)); } total_flops += std::max(op.flop_count, int64{0}); total_transcendentals += std::max(op.transcendental_count, int64{0}); @@ -114,7 +123,7 @@ string HumanReadableProfileBuilder::ToString() const { print_op({is_entry_computation_ ? "[total] [entry]" : "[total]", "[total]", /*category=*/"", total_cycles_, total_flops, total_transcendentals, - total_bytes, optimal_seconds_sum}, + total_bytes, static_cast(optimal_seconds_sum)}, /*is_total=*/true); // Sort ops in decreasing order of cycles, and print them. @@ -155,8 +164,10 @@ string HumanReadableProfileBuilder::ToString() const { entry.text = op.name; entry.short_text = op.short_name; entry.category_text = op.category; - entry.metric = - CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6; + // Ignore ops that run faster than the estimated optimal here, as we do + // when calculating optimal_seconds_sum. + entry.metric = std::max( + 0., CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6); total_discrepancy_in_microseconds += entry.metric; table.AddEntry(std::move(entry)); } diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc index f85d31d5225..cf6cf897fe1 100644 --- a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc @@ -18,19 +18,20 @@ limitations under the License. 
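The human_readable_profile_builder.cc change above keeps an op that happens to run faster than its estimated optimum from distorting the totals: each op now contributes the smaller of its estimated optimal time and its measured time, and the per-op discrepancy is clamped at zero. A small numeric illustration with made-up timings:

// Illustration only: the timings are invented, and OpInfo is a simplification
// of the profile builder's per-op record.
#include <algorithm>
#include <cstdio>

struct OpInfo {
  double optimal_seconds;  // estimated lower bound for the op
  double actual_seconds;   // measured time, i.e. CyclesToSeconds(op.cycles)
};

int main() {
  // The second op beat its (pessimistic) estimate.
  const OpInfo ops[] = {{1.0e-6, 1.5e-6}, {2.0e-6, 1.2e-6}, {0.5e-6, 0.7e-6}};

  double naive_sum = 0, clamped_sum = 0;
  for (const OpInfo& op : ops) {
    naive_sum += op.optimal_seconds;
    // Treat the "optimum" as the smaller of the estimate and the actual time,
    // so an overestimated op cannot inflate the total.
    clamped_sum += std::min(op.optimal_seconds, op.actual_seconds);
  }
  std::printf("naive: %.2g s, clamped: %.2g s\n", naive_sum, clamped_sum);
  // Prints: naive: 3.5e-06 s, clamped: 2.7e-06 s
  return 0;
}

The implicit_broadcast_remover_test.cc hunk continues below.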
#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" namespace op = xla::testing::opcode_matchers; namespace xla { namespace { -class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase { +class ImplicitBroadcastRemoverTest : public HloTestBase { protected: ImplicitBroadcastRemover remover_; }; TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {2, 4}); @@ -41,15 +42,16 @@ TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) { builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); - EXPECT_FALSE(remover_.Run(&module()).ValueOrDie()); + EXPECT_FALSE(remover_.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Parameter(), op::Parameter())); } TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {2, 4}); @@ -60,13 +62,13 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) { builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); - EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter())); @@ -76,6 +78,7 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) { } TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6}); @@ -86,9 +89,9 @@ TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) { builder.AddInstruction(HloInstruction::CreateBinary( shape, HloOpcode::kSubtract, param0, param1)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); - EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Subtract(op::Parameter(), @@ -98,6 +101,7 @@ TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) { } TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {1, 4, 1}); @@ -108,9 +112,9 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) { builder.AddInstruction(HloInstruction::CreateBinary( shape, HloOpcode::kSubtract, 
param0, param1)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); - EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, @@ -120,6 +124,7 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) { } TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8}); @@ -132,9 +137,9 @@ TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) { builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp, param0, param1, param2)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); - EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())), @@ -147,6 +152,7 @@ TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) { TEST_F(ImplicitBroadcastRemoverTest, TernaryScalarAndDegenerateDimensionBroadcast) { + auto m = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6}); @@ -159,9 +165,9 @@ TEST_F(ImplicitBroadcastRemoverTest, builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp, param0, param1, param2)); - HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); - EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()), diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc index 2d03aebc1ac..20cc18f9815 100644 --- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc +++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc @@ -16,12 +16,12 @@ limitations under the License. 
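For context on the implicit_broadcast_remover_test.cc changes above: an implicit broadcast is a binary or ternary HLO op whose operand shape differs from the result shape (a scalar, or a shape with size-1 dimensions). The pass rewrites such operands into an explicit broadcast, plus a reshape when degenerate dimensions have to be dropped; that is what the op::Broadcast and op::Reshape matchers above assert. A toy version of the triggering condition, with plain dimension vectors standing in for XLA shapes:

// Toy condition only; the real pass inspects xla::Shape objects.
#include <cstdio>
#include <vector>

using Dims = std::vector<long long>;

bool IsImplicitBroadcast(const Dims& operand, const Dims& result) {
  // Scalars, lower-rank operands, and size-1 dimensions all qualify.
  return operand != result;
}

int main() {
  const Dims result = {2, 4, 6};
  std::printf("scalar operand  -> %d\n", IsImplicitBroadcast({}, result));
  std::printf("{1,4,1} operand -> %d\n", IsImplicitBroadcast({1, 4, 1}, result));
  std::printf("{2,4,6} operand -> %d\n", IsImplicitBroadcast({2, 4, 6}, result));
  return 0;
}

The indexed_array_analysis_test.cc hunk continues below.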
#include #include "tensorflow/compiler/xla/service/indexed_array_analysis.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" namespace xla { namespace { -class IndexedArrayAnalysisTest : public HloVerifiedTestBase { +class IndexedArrayAnalysisTest : public HloTestBase { protected: void AssertArrayForRootExpressionIs(const string& hlo_text, const string& root_expression) { @@ -61,12 +61,12 @@ class IndexedArrayAnalysisTest : public HloVerifiedTestBase { const string& root_expression, bool print_constants) { IndexedArrayAnalysis indexed_tensor_analysis; - ParseAndVerifyModule(hlo_text); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_text)); - TF_ASSERT_OK_AND_ASSIGN( - IndexedArrayAnalysis::Array* const array_result, - indexed_tensor_analysis.GetArrayFor( - module().entry_computation()->root_instruction())); + TF_ASSERT_OK_AND_ASSIGN(IndexedArrayAnalysis::Array* const array_result, + indexed_tensor_analysis.GetArrayFor( + m->entry_computation()->root_instruction())); string string_result = CanonicalizeWhitespace( indexed_tensor_analysis.ToString(array_result, print_constants)); LOG(INFO) << string_result; diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index a85793e4774..7f2d7e7cffc 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -155,6 +155,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kTanh: case HloOpcode::kTrace: case HloOpcode::kWhile: + case HloOpcode::kGetDimensionSize: return true; } @@ -452,7 +453,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { for (auto* computation : module->MakeNonfusionComputations()) { CHECK(!computation->IsFusionComputation()); computation_ = computation; - reachability_ = computation_->ComputeReachability(); + reachability_ = HloReachabilityMap::Build(computation_); HloInstructionSet do_not_duplicate = ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder()); @@ -566,7 +567,7 @@ bool InstructionFusion::MultiOutputFusionCreatesCycle( // A consumer operand may have been multi-output fused into a parallel // consumer and thus be missing from the original reachability map. if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) { - reachability_ = consumer->parent()->ComputeReachability(); + reachability_ = HloReachabilityMap::Build(consumer->parent()); } return reachability_->IsReachable(a, b); }; diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h index 4045e886dd9..198bd7fce5f 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/instruction_fusion.h @@ -22,6 +22,7 @@ limitations under the License.
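In instruction_fusion.cc above, reachability is now built with the free factory HloReachabilityMap::Build(computation) instead of a method on the computation, and the map is rebuilt when a multi-output-fused operand is missing from it; the instruction_fusion.h hunk below adds the matching hlo_reachability include. A toy reachability map over a string-keyed DAG gives the flavor of how such a map guards multi-output fusion against cycles; the types and the Build/IsReachable signatures here are simplifications, not the XLA API.

// Toy graph: users[x] lists the instructions that consume x.
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

struct ReachabilityMap {
  // reachable[a] holds every node reachable from a, including a itself.
  std::map<std::string, std::set<std::string>> reachable;

  static ReachabilityMap Build(
      const std::map<std::string, std::vector<std::string>>& users) {
    ReachabilityMap m;
    for (const auto& entry : users) {
      const std::string& node = entry.first;
      std::vector<std::string> stack = {node};
      while (!stack.empty()) {
        std::string cur = stack.back();
        stack.pop_back();
        if (!m.reachable[node].insert(cur).second) continue;  // already seen
        auto it = users.find(cur);
        if (it == users.end()) continue;
        for (const std::string& user : it->second) stack.push_back(user);
      }
    }
    return m;
  }

  bool IsReachable(const std::string& from, const std::string& to) const {
    auto it = reachable.find(from);
    return it != reachable.end() && it->second.count(to) > 0;
  }
};

int main() {
  // p feeds both a and c, and a also feeds c. Multi-output fusing p with c
  // would create a cycle because c's other operand, a, is reachable from p.
  std::map<std::string, std::vector<std::string>> users = {
      {"p", {"a", "c"}}, {"a", {"c"}}, {"c", {}}};
  ReachabilityMap m = ReachabilityMap::Build(users);
  std::printf("fusing p and c creates a cycle: %d\n",
              (int)m.IsReachable("p", "a"));
  return 0;
}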
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/core/platform/macros.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc index da1ad90959d..39904bd54b0 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc @@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) { auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape1, computation->root_instruction()); EXPECT_FALSE( @@ -133,7 +133,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastSimpleReshapeOfParameterUnfused) { auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape1, computation->root_instruction()); EXPECT_FALSE( @@ -149,7 +149,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) { auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {}), param0, {})); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(transpose1, computation->root_instruction()); EXPECT_FALSE( @@ -172,7 +172,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusible) { HloInstruction* unary = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(unary, computation->root_instruction()); EXPECT_FALSE( @@ -361,7 +361,7 @@ TEST_F(InstructionFusionTest, AllowUnaryDuplication) { HloInstruction* unary2 = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kAbs, unary1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(unary2, computation->root_instruction()); EXPECT_TRUE( @@ -385,7 +385,7 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) { HloInstruction* unary = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(unary, computation->root_instruction()); EXPECT_TRUE( diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index c9b40d3c619..b0fc1af8b89 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -110,3 +110,5 @@ REGISTER_MODULE_INITIALIZER( // open-source project, so this will be a no-op there. 
REGISTER_MODULE_INITIALIZER_SEQUENCE(interpreter_platform, multi_platform_manager); +REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener, + interpreter_platform); diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 6b033946698..a9041192220 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -2092,6 +2092,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kTrace: case HloOpcode::kTranspose: case HloOpcode::kTuple: + case HloOpcode::kGetDimensionSize: return true; } } diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 47bfca2fd6e..2400b7bb7c4 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -35,7 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -49,20 +49,19 @@ namespace { using ::testing::ElementsAre; -class LayoutAssignmentTest : public HloVerifiedTestBase { +class LayoutAssignmentTest : public HloTestBase { protected: - void AssignLayouts(HloModule* module, - ComputationLayout* entry_computation_layout, + void AssignLayouts(HloModule* m, ComputationLayout* entry_computation_layout, ChannelLayoutConstraints* channel_constraints = nullptr) { LayoutAssignment layout_assignment( entry_computation_layout, LayoutAssignment::InstructionCanChangeLayout, /*channel_constraints=*/channel_constraints); - EXPECT_IS_OK(layout_assignment.Run(module).status()); + EXPECT_IS_OK(layout_assignment.Run(m).status()); } - std::vector LayoutOf(HloModule* module, absl::string_view name) { + std::vector LayoutOf(HloModule* m, absl::string_view name) { auto minor_to_major = - FindInstruction(module, name)->shape().layout().minor_to_major(); + FindInstruction(m, name)->shape().layout().minor_to_major(); return std::vector(minor_to_major.begin(), minor_to_major.end()); } @@ -91,7 +90,7 @@ class LayoutAssignmentTest : public HloVerifiedTestBase { TEST_F(LayoutAssignmentTest, ComputationLayout) { // Verify the layouts of the root and parameter instructions of a computation // match the ComputationLayout for two different layouts. 
- std::vector> minor_to_majors = {{0, 1}, {1, 0}}; + std::vector> minor_to_majors = {{0, 1}, {1, 0}}; for (auto& minor_to_major : minor_to_majors) { auto builder = HloComputation::Builder(TestName()); Shape ashape = ShapeUtil::MakeShape(F32, {42, 12}); @@ -101,8 +100,8 @@ TEST_F(LayoutAssignmentTest, ComputationLayout) { HloInstruction::CreateParameter(1, ashape, "param1")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, param0, param1)); - auto module = CreateNewModule(); - HloComputation* computation = module->AddEntryComputation(builder.Build()); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = m->AddEntryComputation(builder.Build()); Layout layout = LayoutUtil::MakeLayout(minor_to_major); Shape shape(ashape); @@ -113,7 +112,7 @@ TEST_F(LayoutAssignmentTest, ComputationLayout) { *computation_layout.mutable_parameter_layout(0) = shape_layout; *computation_layout.mutable_parameter_layout(1) = shape_layout; *computation_layout.mutable_result_layout() = shape_layout; - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_TRUE(LayoutUtil::Equal(layout, param0->shape().layout())); EXPECT_TRUE(LayoutUtil::Equal(layout, param1->shape().layout())); EXPECT_TRUE(LayoutUtil::Equal(layout, add->shape().layout())); @@ -131,8 +130,8 @@ TEST_F(LayoutAssignmentTest, ComputationLayoutMixedLayout) { HloInstruction::CreateParameter(1, ashape, "param1")); builder.AddInstruction( HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, param0, param1)); - auto module = CreateNewModule(); - HloComputation* computation = module->AddEntryComputation(builder.Build()); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = m->AddEntryComputation(builder.Build()); Layout col_major_layout = LayoutUtil::MakeLayout({1, 0}); Shape col_major_shape(ashape); @@ -149,7 +148,7 @@ TEST_F(LayoutAssignmentTest, ComputationLayoutMixedLayout) { *computation_layout.mutable_parameter_layout(1) = row_major; *computation_layout.mutable_result_layout() = col_major; - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_TRUE(LayoutUtil::Equal(col_major_layout, param0->shape().layout())); EXPECT_TRUE(LayoutUtil::Equal(row_major_layout, param1->shape().layout())); EXPECT_TRUE(LayoutUtil::Equal( @@ -160,7 +159,7 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) { // Verify that the layout of the fused parameters in a fusion instruction // match that of the fusion operands. Other fused instructions should have no // layout. 
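Nearly every case in layout_assignment_test.cc pins layouts through minor_to_major lists, so the convention is worth spelling out: the first entry is the minor-most, fastest-varying dimension, so {1, 0} on a rank-2 array is row-major and {0, 1} is column-major. Below is a standalone index computation for the f32[42,12] shape used in the ComputationLayout test above; it illustrates the convention and is not the LayoutUtil implementation.

// Illustration of minor_to_major ordering; not XLA code.
#include <cstdint>
#include <cstdio>
#include <vector>

int64_t LinearIndex(const std::vector<int64_t>& dims,
                    const std::vector<int64_t>& minor_to_major,
                    const std::vector<int64_t>& index) {
  int64_t linear = 0;
  int64_t stride = 1;
  for (int64_t dim : minor_to_major) {  // walk from minor-most to major-most
    linear += index[dim] * stride;
    stride *= dims[dim];
  }
  return linear;
}

int main() {
  const std::vector<int64_t> dims = {42, 12};  // f32[42,12], as in the test
  const std::vector<int64_t> idx = {3, 5};
  std::printf("minor_to_major {1,0} (row-major):    %lld\n",
              (long long)LinearIndex(dims, {1, 0}, idx));  // 3 * 12 + 5 = 41
  std::printf("minor_to_major {0,1} (column-major): %lld\n",
              (long long)LinearIndex(dims, {0, 1}, idx));  // 5 * 42 + 3 = 213
  return 0;
}

The FusionInstruction test introduced by the comment just above then exercises the same two layouts on fused parameters.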
- std::vector> minor_to_majors = {{0, 1}, {1, 0}}; + std::vector> minor_to_majors = {{0, 1}, {1, 0}}; for (auto& minor_to_major : minor_to_majors) { auto builder = HloComputation::Builder(TestName()); auto constant_literal1 = LiteralUtil::CreateR2WithLayout( @@ -180,8 +179,8 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) { auto negate2 = builder.AddInstruction( HloInstruction::CreateUnary(ashape, HloOpcode::kNegate, negate1)); - auto module = CreateNewModule(); - HloComputation* computation = module->AddEntryComputation(builder.Build()); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = m->AddEntryComputation(builder.Build()); auto fusion = computation->CreateFusionInstruction( {negate2, negate1, add}, HloInstruction::FusionKind::kLoop); @@ -194,7 +193,7 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) { ComputationLayout computation_layout(computation->ComputeProgramShape()); *computation_layout.mutable_result_layout() = shape_layout; - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_TRUE(LayoutUtil::Equal( layout, fusion->fused_parameter(0)->shape().layout())); @@ -229,13 +228,13 @@ TEST_F(LayoutAssignmentTest, TupleLayout) { auto negate = builder.AddInstruction(HloInstruction::CreateUnary( constant0->shape(), HloOpcode::kNegate, get_element0)); - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); + auto m = CreateNewVerifiedModule(); + m->AddEntryComputation(builder.Build()); ComputationLayout computation_layout( - module->entry_computation()->ComputeProgramShape()); + m->entry_computation()->ComputeProgramShape()); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_TRUE( LayoutUtil::LayoutsInShapesEqual(constant0->shape(), constant1->shape())); @@ -267,17 +266,17 @@ TEST_F(LayoutAssignmentTest, TupleSelect) { auto select = builder.AddInstruction(HloInstruction::CreateTernary( tuple0->shape(), HloOpcode::kTupleSelect, pred, tuple0, tuple1)); - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); + auto m = CreateNewVerifiedModule(); + m->AddEntryComputation(builder.Build()); ComputationLayout computation_layout( - module->entry_computation()->ComputeProgramShape()); + m->entry_computation()->ComputeProgramShape()); Shape result_shape = ShapeUtil::MakeTupleShape({constant0->shape(), constant1->shape()}); TF_CHECK_OK(computation_layout.mutable_result_layout()->CopyLayoutFromShape( result_shape)); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(result_shape, select->shape())); } @@ -302,11 +301,11 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { auto nested_tuple = builder.AddInstruction( HloInstruction::CreateTuple({inner_tuple, inner_tuple})); - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); + auto m = CreateNewVerifiedModule(); + m->AddEntryComputation(builder.Build()); ComputationLayout computation_layout( - module->entry_computation()->ComputeProgramShape()); + m->entry_computation()->ComputeProgramShape()); Shape result_shape = nested_tuple->shape(); *ShapeUtil::GetMutableSubshape(&result_shape, /*index=*/{0, 0}) = ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0}); @@ -316,7 +315,7 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { result_shape)); LayoutAssignment layout_assignment(&computation_layout); - AssignLayouts(module, &computation_layout); + 
AssignLayouts(m.get(), &computation_layout); // Layout assignment should have deep copied the result of the computation to // address the layout conflict. This results in several Tuple() and @@ -332,9 +331,9 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { EXPECT_TRUE( AlgebraicSimplifier(/*is_layout_sensitive=*/true, [](const Shape&, const Shape&) { return false; }) - .Run(module) + .Run(m.get()) .ValueOrDie()); - HloInstruction* root = module->entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); // Verify layout of the root and the root's operands. EXPECT_TRUE(ShapeUtil::Equal(result_shape, root->shape())); EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::GetSubshape(result_shape, {0}), @@ -361,9 +360,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) { auto tanh = builder.AddInstruction( HloInstruction::CreateUnary(bshape, HloOpcode::kTanh, reshape)); - auto module = CreateNewModule(); - HloComputation* computation = - module->AddEntryComputation(builder.Build(tanh)); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = m->AddEntryComputation(builder.Build(tanh)); Shape ashape_with_layout(ashape); Shape bshape_with_layout(bshape); @@ -374,7 +372,7 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) { *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ashape_with_layout); *computation_layout.mutable_result_layout() = ShapeLayout(bshape_with_layout); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); auto log_minor_to_major = AsInt64Slice(log->shape().layout().minor_to_major()); @@ -403,8 +401,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndTranspose) { HloInstruction::CreateTranspose(bshape, log, {1, 0})); auto tanh = builder.AddInstruction( HloInstruction::CreateUnary(bshape, HloOpcode::kTanh, transpose)); - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build(tanh)); + auto m = CreateNewVerifiedModule(); + auto computation = m->AddEntryComputation(builder.Build(tanh)); Shape ashape_with_layout(ashape); Shape bshape_with_layout(bshape); @@ -415,7 +413,7 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndTranspose) { *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ashape_with_layout); *computation_layout.mutable_result_layout() = ShapeLayout(bshape_with_layout); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_TRUE( LayoutUtil::Equal(ashape_with_layout.layout(), log->shape().layout())); @@ -439,9 +437,9 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) { HloInstruction::CreateBroadcast(bshape, param, {1, 2})); auto transpose = builder.AddInstruction( HloInstruction::CreateTranspose(cshape, broadcast, {2, 1, 0})); - auto module = CreateNewModule(); + auto m = CreateNewVerifiedModule(); HloComputation* computation = - module->AddEntryComputation(builder.Build(transpose)); + m->AddEntryComputation(builder.Build(transpose)); Shape input_shape_with_layout(ashape); Shape output_shape_with_layout(cshape); @@ -454,7 +452,7 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) { ShapeLayout(input_shape_with_layout); *computation_layout.mutable_result_layout() = ShapeLayout(output_shape_with_layout); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_THAT(broadcast->shape().layout().minor_to_major(), ElementsAre(0, 1, 2)); @@ -488,9 +486,8 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) { 
HloInstruction::CreateBroadcast(f32_234, tanh, {1, 2})); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({transpose, broadcast2})); - auto module = CreateNewModule(); - HloComputation* computation = - module->AddEntryComputation(builder.Build(tuple)); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = m->AddEntryComputation(builder.Build(tuple)); ComputationLayout computation_layout(computation->ComputeProgramShape()); Shape param_shape_with_layout(f32_4); @@ -507,7 +504,7 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) { *computation_layout.mutable_result_layout() = ShapeLayout(ShapeUtil::MakeTupleShape( {transpose_shape_with_layout, broadcast2_shape_with_layout})); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_THAT(broadcast->shape().layout().minor_to_major(), ElementsAre(0, 1)); EXPECT_THAT(transpose->shape().layout().minor_to_major(), ElementsAre(1, 0)); @@ -558,9 +555,8 @@ TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) { HloInstruction::CreateConcatenate(bshape, {param0, param1}, 1)); auto reshape = builder.AddInstruction( HloInstruction::CreateReshape(cshape, concatenate)); - auto module = CreateNewModule(); - HloComputation* computation = - module->AddEntryComputation(builder.Build(reshape)); + auto m = CreateNewVerifiedModule(); + HloComputation* computation = m->AddEntryComputation(builder.Build(reshape)); Shape param0_shape_with_layout(ashape); Shape param1_shape_with_layout(ashape); @@ -573,7 +569,7 @@ TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) { *computation_layout.mutable_parameter_layout(1) = ShapeLayout(param1_shape_with_layout); OperandsMustBeTheSameLayoutAssignment layout_assignment(&computation_layout); - EXPECT_IS_OK(layout_assignment.Run(module).status()); + EXPECT_IS_OK(layout_assignment.Run(m.get()).status()); EXPECT_EQ(HloOpcode::kCopy, concatenate->operand(0)->opcode()); EXPECT_THAT(concatenate->operand(0)->shape().layout().minor_to_major(), @@ -593,11 +589,11 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastFromOperand) { HloInstruction::CreateParameter(0, input_shape_with_layout, "param")); auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {6, 7, 3, 5}), param, {2, 3, 0, 1})); - auto module = CreateNewModule(); + auto m = CreateNewVerifiedModule(); HloComputation* computation = - module->AddEntryComputation(builder.Build(transpose)); + m->AddEntryComputation(builder.Build(transpose)); ComputationLayout computation_layout(computation->ComputeProgramShape()); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(), transpose->shape(), {2, 3, 0, 1})); } @@ -611,11 +607,11 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) { HloInstruction::CreateBroadcast(input_shape, constant, {})); auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {6, 7, 3, 5}), broadcast, {2, 3, 0, 1})); - auto module = CreateNewModule(); + auto m = CreateNewVerifiedModule(); HloComputation* computation = - module->AddEntryComputation(builder.Build(transpose)); + m->AddEntryComputation(builder.Build(transpose)); ComputationLayout computation_layout(computation->ComputeProgramShape()); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(), transpose->shape(), {2, 
3, 0, 1})); } @@ -681,12 +677,12 @@ TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) { } )"; - ParseAndVerifyModule(module_str); - + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); std::unique_ptr compiled_module = backend() .compiler() - ->RunHloPasses(module().Clone(), backend().default_stream_executor(), + ->RunHloPasses(m->Clone(), backend().default_stream_executor(), /*device_allocator=*/nullptr) .ConsumeValueOrDie(); @@ -721,9 +717,10 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) { } )"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); ComputationLayout computation_layout( - module().entry_computation()->ComputeProgramShape()); + m->entry_computation()->ComputeProgramShape()); Shape param_shape = ShapeUtil::MakeTupleShape( {ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}), ShapeUtil::MakeTupleShape({ @@ -735,19 +732,19 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) { param_shape)); computation_layout.mutable_result_layout()->ResetLayout( LayoutUtil::MakeLayout({2, 1, 0})); - AssignLayouts(&module(), &computation_layout); + AssignLayouts(m.get(), &computation_layout); - EXPECT_THAT(LayoutOf(&module(), "gte0"), ElementsAre(0, 1, 2)); - EXPECT_THAT(LayoutOf(&module(), "gte1a"), ElementsAre(1, 2, 0)); - EXPECT_THAT(LayoutOf(&module(), "gte1b"), ElementsAre(2, 0, 1)); - EXPECT_THAT(LayoutOf(&module(), "fresult"), ElementsAre(2, 1, 0)); - EXPECT_THAT(FindInstruction(&module(), "gte1") + EXPECT_THAT(LayoutOf(m.get(), "gte0"), ElementsAre(0, 1, 2)); + EXPECT_THAT(LayoutOf(m.get(), "gte1a"), ElementsAre(1, 2, 0)); + EXPECT_THAT(LayoutOf(m.get(), "gte1b"), ElementsAre(2, 0, 1)); + EXPECT_THAT(LayoutOf(m.get(), "fresult"), ElementsAre(2, 1, 0)); + EXPECT_THAT(FindInstruction(m.get(), "gte1") ->shape() .tuple_shapes(0) .layout() .minor_to_major(), ElementsAre(1, 2, 0)); - EXPECT_THAT(FindInstruction(&module(), "gte1") + EXPECT_THAT(FindInstruction(m.get(), "gte1") ->shape() .tuple_shapes(1) .layout() @@ -757,7 +754,7 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) { TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { auto builder = HloComputation::Builder(TestName()); - auto module = CreateNewModule(); + auto m = CreateNewVerifiedModule(); Shape shape = ShapeUtil::MakeShape(F32, {128, 8}); Shape tshape = ShapeUtil::MakeTupleShape({shape, shape}); Shape result_tshape = ShapeUtil::MakeTupleShape({shape}); @@ -784,7 +781,7 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { true_builder.AddInstruction(HloInstruction::CreateTuple({add})); } HloComputation* true_computation = - module->AddEmbeddedComputation(true_builder.Build()); + m->AddEmbeddedComputation(true_builder.Build()); auto false_builder = HloComputation::Builder(TestName() + "_FalseBranch"); { @@ -800,14 +797,14 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { false_builder.AddInstruction(HloInstruction::CreateTuple({infeed_data})); } HloComputation* false_computation = - module->AddEmbeddedComputation(false_builder.Build()); + m->AddEmbeddedComputation(false_builder.Build()); builder.AddInstruction(HloInstruction::CreateConditional( result_tshape, pred, tuple, true_computation, tuple, false_computation)); - HloComputation* computation = module->AddEntryComputation(builder.Build()); + HloComputation* computation = m->AddEntryComputation(builder.Build()); ComputationLayout 
computation_layout(computation->ComputeProgramShape()); - AssignLayouts(module, &computation_layout); + AssignLayouts(m.get(), &computation_layout); const HloInstruction* true_root = true_computation->root_instruction(); const HloInstruction* false_root = false_computation->root_instruction(); @@ -828,13 +825,13 @@ TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); builder.AddInstruction(HloInstruction::CreateUnary( constant0->shape(), HloOpcode::kBitcast, constant0)); - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); + auto m = CreateNewVerifiedModule(); + m->AddEntryComputation(builder.Build()); ComputationLayout computation_layout( - module->entry_computation()->ComputeProgramShape()); + m->entry_computation()->ComputeProgramShape()); LayoutAssignment layout_assignment(&computation_layout); - Status error_status = layout_assignment.Run(module).status(); + Status error_status = layout_assignment.Run(m.get()).status(); EXPECT_FALSE(error_status.ok()); EXPECT_THAT( error_status.error_message(), @@ -861,9 +858,10 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { } )"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); ComputationLayout computation_layout( - module().entry_computation()->ComputeProgramShape()); + m->entry_computation()->ComputeProgramShape()); Shape param_shape = ShapeUtil::MakeTupleShape( {ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})}); TF_ASSERT_OK( @@ -873,12 +871,12 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { LayoutUtil::MakeLayout({1, 0})); ChannelLayoutConstraints channel_constraints; - AssignLayouts(&module(), &computation_layout, &channel_constraints); + AssignLayouts(m.get(), &computation_layout, &channel_constraints); - EXPECT_THAT(LayoutOf(&module(), "gte"), ElementsAre(0, 1)); - EXPECT_THAT(LayoutOf(&module(), "root"), ElementsAre(1, 0)); + EXPECT_THAT(LayoutOf(m.get(), "gte"), ElementsAre(0, 1)); + EXPECT_THAT(LayoutOf(m.get(), "root"), ElementsAre(1, 0)); EXPECT_TRUE(ShapeUtil::Equal( - ShapeUtil::GetSubshape(FindInstruction(&module(), "send")->shape(), {0}), + ShapeUtil::GetSubshape(FindInstruction(m.get(), "send")->shape(), {0}), ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0}))); } @@ -897,17 +895,17 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) { param = (f32[2,2]) parameter(0) gte = f32[2,2] get-tuple-element(param), index=0 ar.0 = f32[2,2] cross-replica-sum(gte), - all_reduce_id=0, replica_groups={{0}}, to_apply=add, + all_reduce_id=1, replica_groups={{0}}, to_apply=add, sharding={maximal device=0} const = f32[2,2] constant(f32[2,2]{{0,1},{2,3}}) ROOT ar.1 = f32[2,2] cross-replica-sum(const), - all_reduce_id=0, replica_groups={{0}}, to_apply=add, + all_reduce_id=1, replica_groups={{0}}, to_apply=add, sharding={maximal device=1} })"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, ParseAndReturnVerifiedModule(module_str)); ComputationLayout computation_layout( - module->entry_computation()->ComputeProgramShape()); + m->entry_computation()->ComputeProgramShape()); Shape param_shape = ShapeUtil::MakeTupleShape( {ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})}); TF_ASSERT_OK( @@ -917,12 +915,12 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) { LayoutUtil::MakeLayout({1, 0})); ChannelLayoutConstraints channel_constraints; - AssignLayouts(module.get(), &computation_layout, 
&channel_constraints); + AssignLayouts(m.get(), &computation_layout, &channel_constraints); - EXPECT_THAT(LayoutOf(module.get(), "gte"), ElementsAre(0, 1)); - EXPECT_THAT(LayoutOf(module.get(), "ar.0"), ElementsAre(0, 1)); - EXPECT_THAT(LayoutOf(module.get(), "ar.1"), ElementsAre(0, 1)); - const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(LayoutOf(m.get(), "gte"), ElementsAre(0, 1)); + EXPECT_THAT(LayoutOf(m.get(), "ar.0"), ElementsAre(0, 1)); + EXPECT_THAT(LayoutOf(m.get(), "ar.1"), ElementsAre(0, 1)); + const HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_THAT(root->shape().layout().minor_to_major(), ElementsAre(1, 0)); } @@ -938,11 +936,12 @@ TEST_F(LayoutAssignmentTest, CopySliceOperandToAvoidImplicitLayoutChange) { } )"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); auto compiled_module = backend() .compiler() - ->RunHloPasses(module().Clone(), backend().default_stream_executor(), + ->RunHloPasses(m->Clone(), backend().default_stream_executor(), /*device_allocator=*/nullptr) .ConsumeValueOrDie(); HloInstruction* root = @@ -966,11 +965,12 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) { } )"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); auto compiled_module = backend() .compiler() - ->RunHloPasses(module().Clone(), backend().default_stream_executor(), + ->RunHloPasses(m->Clone(), backend().default_stream_executor(), /*device_allocator=*/nullptr) .ConsumeValueOrDie(); HloInstruction* root = @@ -997,11 +997,12 @@ TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) { } )"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); auto compiled_module = backend() .compiler() - ->RunHloPasses(module().Clone(), backend().default_stream_executor(), + ->RunHloPasses(m->Clone(), backend().default_stream_executor(), /*device_allocator=*/nullptr) .ConsumeValueOrDie(); HloInstruction* root = @@ -1028,11 +1029,12 @@ TEST_F(LayoutAssignmentTest, } )"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); auto compiled_module = backend() .compiler() - ->RunHloPasses(module().Clone(), backend().default_stream_executor(), + ->RunHloPasses(m->Clone(), backend().default_stream_executor(), /*device_allocator=*/nullptr) .ConsumeValueOrDie(); HloInstruction* root = @@ -1050,11 +1052,12 @@ TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) { } )"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); auto compiled_module = backend() .compiler() - ->RunHloPasses(module().Clone(), backend().default_stream_executor(), + ->RunHloPasses(m->Clone(), backend().default_stream_executor(), /*device_allocator=*/nullptr) .ConsumeValueOrDie(); HloInstruction* root = @@ -1107,20 +1110,21 @@ TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) { } )"; - ParseAndVerifyModule(module_str); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(module_str)); ComputationLayout computation_layout( - module().entry_computation()->ComputeProgramShape()); + m->entry_computation()->ComputeProgramShape()); // Sanity check to verify that there's a layout mismatch. 
- EXPECT_THAT(LayoutOf(&module(), "ibuf"), ElementsAre(0, 1)); - EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0)); + EXPECT_THAT(LayoutOf(m.get(), "ibuf"), ElementsAre(0, 1)); + EXPECT_THAT(LayoutOf(m.get(), "next_buf"), ElementsAre(1, 0)); - AssignLayouts(&module(), &computation_layout); + AssignLayouts(m.get(), &computation_layout); // Make sure that layout assignment did not magically eliminate the mismatch, // in which case the test didn't prove anything. - EXPECT_THAT(LayoutOf(&module(), "ibuf"), ElementsAre(0, 1)); - EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0)); + EXPECT_THAT(LayoutOf(m.get(), "ibuf"), ElementsAre(0, 1)); + EXPECT_THAT(LayoutOf(m.get(), "next_buf"), ElementsAre(1, 0)); } TEST_F(LayoutAssignmentTest, CustomCallNotLayoutConstrained) { @@ -1136,32 +1140,32 @@ ENTRY %CustomCallWithNotLayoutConstrained (p: f32[42,2,3]) -> f32[1,2,3,4] { // and result layout should match that of the computation. { TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr module, + std::unique_ptr m, ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest())); - ComputationLayout computation_layout = module->entry_computation_layout(); + ComputationLayout computation_layout = m->entry_computation_layout(); *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {42, 2, 3}, {0, 2, 1})); *computation_layout.mutable_result_layout() = ShapeLayout( ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {3, 2, 0, 1})); - AssignLayouts(module.get(), &computation_layout); + AssignLayouts(m.get(), &computation_layout); - HloInstruction* root = module->entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); ASSERT_THAT(root, op::CustomCall(op::Parameter())); ExpectLayoutIs(root->shape(), {3, 2, 0, 1}); ExpectLayoutIs(root->operand(0)->shape(), {0, 2, 1}); } { TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr module, + std::unique_ptr m, ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest())); - ComputationLayout computation_layout = module->entry_computation_layout(); + ComputationLayout computation_layout = m->entry_computation_layout(); *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {42, 2, 3}, {0, 1, 2})); *computation_layout.mutable_result_layout() = ShapeLayout( ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {0, 2, 3, 1})); - AssignLayouts(module.get(), &computation_layout); + AssignLayouts(m.get(), &computation_layout); - HloInstruction* root = module->entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); ASSERT_THAT(root, op::CustomCall(op::Parameter())); ExpectLayoutIs(root->shape(), {0, 2, 3, 1}); ExpectLayoutIs(root->operand(0)->shape(), {0, 1, 2}); @@ -1179,24 +1183,24 @@ ENTRY %CustomCallWithLayoutConstraints (p0: f32[4,4], p1: f32[2,3]) -> f32[1,2,3 } )"; TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr module, + std::unique_ptr m, ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest())); - ComputationLayout computation_layout = module->entry_computation_layout(); + ComputationLayout computation_layout = m->entry_computation_layout(); *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0})); *computation_layout.mutable_parameter_layout(1) = ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0})); *computation_layout.mutable_result_layout() = ShapeLayout( 
ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3})); - AssignLayouts(module.get(), &computation_layout); + AssignLayouts(m.get(), &computation_layout); // The custom call should be partially encapsulated in kCopy instructions // because of the layout mismatches. - ASSERT_THAT(module->entry_computation()->root_instruction(), + ASSERT_THAT(m->entry_computation()->root_instruction(), op::Copy(op::CustomCall(op::Copy(), op::Parameter()))); const HloInstruction* custom_call = - module->entry_computation()->root_instruction()->operand(0); + m->entry_computation()->root_instruction()->operand(0); ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1}); ExpectLayoutIs(custom_call->operand(0)->shape(), {0, 1}); ExpectLayoutIs(custom_call->operand(1)->shape(), {1, 0}); @@ -1211,18 +1215,18 @@ ENTRY %CustomCallLayoutConstrainedZeroOperands () -> f32[1,2,3,4] { } )"; TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr module, + std::unique_ptr m, ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest())); - ComputationLayout computation_layout = module->entry_computation_layout(); + ComputationLayout computation_layout = m->entry_computation_layout(); *computation_layout.mutable_result_layout() = ShapeLayout( ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3})); - AssignLayouts(module.get(), &computation_layout); + AssignLayouts(m.get(), &computation_layout); - ASSERT_THAT(module->entry_computation()->root_instruction(), + ASSERT_THAT(m->entry_computation()->root_instruction(), op::Copy(op::CustomCall())); const HloInstruction* custom_call = - module->entry_computation()->root_instruction()->operand(0); + m->entry_computation()->root_instruction()->operand(0); ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1}); } @@ -1238,25 +1242,25 @@ ENTRY %CustomCallLayoutConstrainedTupleOperand (p0: f32[4,4], p1: f32[2,3]) -> f } )"; TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr module, + std::unique_ptr m, ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest())); - ComputationLayout computation_layout = module->entry_computation_layout(); + ComputationLayout computation_layout = m->entry_computation_layout(); *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0})); *computation_layout.mutable_parameter_layout(1) = ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0})); *computation_layout.mutable_result_layout() = ShapeLayout( ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3})); - AssignLayouts(module.get(), &computation_layout); + AssignLayouts(m.get(), &computation_layout); - HloInstruction* root = module->entry_computation()->root_instruction(); + HloInstruction* root = m->entry_computation()->root_instruction(); ExpectLayoutIs(root->shape(), {2, 1, 0, 3}); - ASSERT_THAT(module->entry_computation()->root_instruction(), + ASSERT_THAT(m->entry_computation()->root_instruction(), op::Copy(op::CustomCall(op::Tuple()))); const HloInstruction* custom_call = - module->entry_computation()->root_instruction()->operand(0); + m->entry_computation()->root_instruction()->operand(0); ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1}); ExpectTupleLayoutIs(custom_call->operand(0)->shape(), {{1, 0}, {0, 1}}); } @@ -1273,36 +1277,34 @@ ENTRY %CustomCallLayoutConstrainedTupleResult (p0: f32[4,4]) -> (f32[4,4]{1,0}, // Try with a couple different layouts. In each case the custom calls operand // and result layout should match that of the computation. 
TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr module, + std::unique_ptr m, ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest())); - ComputationLayout computation_layout = module->entry_computation_layout(); + ComputationLayout computation_layout = m->entry_computation_layout(); *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0})); *computation_layout.mutable_result_layout() = ShapeLayout(ShapeUtil::MakeTupleShape( {ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}), ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0})})); - AssignLayouts(module.get(), &computation_layout); + AssignLayouts(m.get(), &computation_layout); - ExpectTupleLayoutIs(module->result_shape(), {{1, 0}, {1, 0}}); + ExpectTupleLayoutIs(m->result_shape(), {{1, 0}, {1, 0}}); - const HloInstruction* custom_call = - FindInstruction(module.get(), "custom-call"); + const HloInstruction* custom_call = FindInstruction(m.get(), "custom-call"); ExpectTupleLayoutIs(custom_call->shape(), {{1, 0}, {0, 1}}); } Status AssignLayoutsToComputation( - HloModule* module, - ChannelLayoutConstraints* channel_constraints = nullptr) { - if (!module->entry_computation_layout().result_layout().LayoutIsSet()) { - module->mutable_entry_computation_layout() + HloModule* m, ChannelLayoutConstraints* channel_constraints = nullptr) { + if (!m->entry_computation_layout().result_layout().LayoutIsSet()) { + m->mutable_entry_computation_layout() ->mutable_result_layout() ->SetToDefaultLayout(); } LayoutAssignment layout_assignment( - module->mutable_entry_computation_layout(), + m->mutable_entry_computation_layout(), LayoutAssignment::InstructionCanChangeLayout, channel_constraints); - return layout_assignment.Run(module).status(); + return layout_assignment.Run(m).status(); } TEST_F(LayoutAssignmentTest, OverwriteDiamondShapedConstraintsX) { @@ -1325,16 +1327,16 @@ TEST_F(LayoutAssignmentTest, OverwriteDiamondShapedConstraintsX) { auto add = b.AddInstruction( HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, transpose, param1)); b.AddInstruction(HloInstruction::CreateTuple({add, transpose})); - auto module = CreateNewVerifiedModule(); - module->AddEntryComputation(b.Build()); + auto m = CreateNewVerifiedModule(); + m->AddEntryComputation(b.Build()); Shape ashape_major = ShapeUtil::MakeShapeWithLayout(F32, {12, 8}, {1, 0}); Shape ashape_minor = ShapeUtil::MakeShapeWithLayout(F32, {12, 8}, {0, 1}); - *module->mutable_entry_computation_layout()->mutable_result_layout() = + *m->mutable_entry_computation_layout()->mutable_result_layout() = ShapeLayout(ShapeUtil::MakeTupleShape({ashape_major, ashape_minor})); const Layout r2_dim0major = LayoutUtil::MakeLayout({1, 0}); - ForceParameterLayout(module.get(), 0, r2_dim0major); - ForceParameterLayout(module.get(), 1, r2_dim0major); - TF_ASSERT_OK(AssignLayoutsToComputation(module.get())); + ForceParameterLayout(m.get(), 0, r2_dim0major); + ForceParameterLayout(m.get(), 1, r2_dim0major); + TF_ASSERT_OK(AssignLayoutsToComputation(m.get())); EXPECT_THAT(add->shape().layout().minor_to_major(), ElementsAre(1, 0)); EXPECT_THAT(add->operand(0)->shape().layout().minor_to_major(), diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 850501a4b5c..728a66b388f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -169,6 +169,7 @@ cc_library( "//tensorflow/compiler/xla/service:elemental_ir_emitter", 
"//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@llvm//:core", ], @@ -197,14 +198,17 @@ cc_library( hdrs = ["sort_util.h"], deps = [ ":ir_array", + ":kernel_support_library", ":llvm_loop", ":llvm_util", ":loop_emitter", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter", "//tensorflow/compiler/xla/service/gpu:partition_assignment", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", "@llvm//:support", ], diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index 2e5aebb74c2..df78726166e 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -24,6 +24,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Operator.h" #include "llvm/Target/TargetOptions.h" @@ -83,10 +84,9 @@ string DumpModuleToString(const llvm::Module& module) { return AsString(buffer_string); } -llvm::Value* EmitCallToIntrinsic(llvm::Intrinsic::ID intrinsic_id, - absl::Span operands, - absl::Span overloaded_types, - llvm::IRBuilder<>* b) { +llvm::CallInst* EmitCallToIntrinsic( + llvm::Intrinsic::ID intrinsic_id, absl::Span operands, + absl::Span overloaded_types, llvm::IRBuilder<>* b) { llvm::Module* module = ModuleFromIRBuilder(b); llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration( module, intrinsic_id, AsArrayRef(overloaded_types)); @@ -260,6 +260,17 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal, /*AddNull=*/false); } +llvm::GlobalVariable* AllocateSharedMemoryTile(llvm::Module* module, + llvm::Type* tile_type, + absl::string_view name) { + const int kNVPTXSharedMemoryAddrSpace = 3; + return new llvm::GlobalVariable( + *module, tile_type, + /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage, + llvm::UndefValue::get(tile_type), AsStringRef(name), nullptr, + llvm::GlobalValue::NotThreadLocal, kNVPTXSharedMemoryAddrSpace); +} + llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type, absl::string_view name, llvm::IRBuilder<>* b, diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h index f59baff263f..c604c7c870a 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h @@ -24,6 +24,7 @@ limitations under the License. #include "absl/types/span.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -101,10 +102,9 @@ string SanitizeFunctionName(string function_name); // intrinsics (for example, "minnum") must include a type in overloaded_types // for each overloaded type. Typically, overloaded intrinsics have only a single // overloaded type. -llvm::Value* EmitCallToIntrinsic(llvm::Intrinsic::ID intrinsic_id, - absl::Span operands, - absl::Span overloaded_types, - llvm::IRBuilder<>* b); +llvm::CallInst* EmitCallToIntrinsic( + llvm::Intrinsic::ID intrinsic_id, absl::Span operands, + absl::Span overloaded_types, llvm::IRBuilder<>* b); // Emit float max. 
Emit maxnum intrinsic is fast math is disabled, or // fcmp+select otherwise @@ -155,6 +155,11 @@ StatusOr DecodeSelfDescribingShapeConstant(const void* shape_ptr, llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal, llvm::Module* module); +// Allocates a tile of shared memory. +llvm::GlobalVariable* AllocateSharedMemoryTile(llvm::Module* module, + llvm::Type* tile_type, + absl::string_view name); + // Inserts an allocate of the requested type at the entry point of the // function that the builder is currently building. The insert point // of the builder is set to the same place after calling this function diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc index 05ba4a40da4..fd16af67fe9 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc @@ -18,7 +18,9 @@ limitations under the License. #include // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/ADT/APInt.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -28,10 +30,12 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h" #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" @@ -39,147 +43,352 @@ namespace xla { namespace llvm_ir { namespace { -// Adds the inner comparison loop where we compare elements pointed to by -// 'keys_index' and 'compare_keys_index'. -void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index, - const IrArray::Index& compare_keys_index, - const IrArray& keys_array, - const std::vector& values_arrays, - llvm::IRBuilder<>* b) { - // if (is_smaller_index && - // compare_keys[dimension_to_sort] < dimension_to_sort_bound) - llvm::Value* is_smaller_index = b->CreateICmpSLT( - keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]); - int64 dimension_to_sort_bound = - keys_array.GetShape().dimensions(dimension_to_sort); - auto if_data = EmitIfThenElse( - b->CreateAnd(is_smaller_index, - b->CreateICmpSLT(compare_keys_index[dimension_to_sort], - keys_index.GetConstantWithIndexType( - dimension_to_sort_bound))), - "smaller_comparison_index", b, /*emit_else=*/false); - SetToFirstInsertPoint(if_data.true_block, b); - auto key1 = keys_array.EmitReadArrayElement(keys_index, b); - auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, b); - auto compare_key1 = key1; - auto compare_key2 = key2; - auto key_type = keys_array.GetShape().element_type(); - bool is_signed_comparison = true; - if (primitive_util::IsFloatingPointType(key_type)) { - // We would like a total order of floating point numbers so that the sort - // has a predictable behavior in the presence of NaNs. Rather than using - // floating point comparison, we use the following trick: - // If f is a float, and - // x = bit_cast(f); - // y = x < 0 ? 
0x7FFFFFFF - x : x; - // then y is ordered as an int32 such that finite values have the obvious - // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning - // and end of the ordering. - auto k = b->getInt(llvm::APInt::getSignedMaxValue( - key1->getType()->getPrimitiveSizeInBits())); - auto comparison_type = k->getType(); - auto zero = llvm::ConstantInt::get(comparison_type, 0); - auto maybe_flip = [&](llvm::Value* v) { - return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero), - b->CreateSub(k, v), v); - }; - compare_key1 = b->CreateBitCast(key1, comparison_type); - compare_key2 = b->CreateBitCast(key2, comparison_type); - compare_key1 = maybe_flip(compare_key1); - compare_key2 = maybe_flip(compare_key2); - } else if (!primitive_util::IsSignedIntegralType(key_type)) { - is_signed_comparison = false; + +// Adds the inner comparison loop body where we compare elements. +void EmitCompareLoopBody( + int64 iteration_bound, PrimitiveType key_type, int64 num_values, + llvm::Value* element_pair_index, int64 xor_mask, llvm::Type* index_type, + std::function read_element, + std::function + write_element, + llvm::IRBuilder<>* b, bool needs_bounds_checks = true) { + auto index_typed_constant = [&](int64 value) { + return llvm::ConstantInt::get(index_type, value); + }; + // The 'xor_mask' determines which elements are compared against each other. + // Index 'current_keys_index' will be compared with 'current_keys_index' xor + // 'xor_mask'. This means that we will always compare a block of consecutive + // elements against elements from the adjacent block of the same size. When + // 'xor_mask' is a power of 2, it immediately identifies the size of such a + // block. We can also have 'xor_mask' being 2^k - 1 (for some value of k). In + // that case, we essentially flip the last 'k' - 1 bits when computing the + // position of the element to compare to, so the block size is 2^(k - 1). + int64 block_size = xor_mask; + // Check if it is a value 2^k - 1. + if (xor_mask > 1 && (xor_mask & (xor_mask + 1)) == 0) { + block_size = (xor_mask + 1) / 2; } - auto comparison = - b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT - : llvm::ICmpInst::ICMP_ULT, - compare_key2, compare_key1); - // If key2 < key1 - auto if_smaller_data = - EmitIfThenElse(comparison, "is_smaller_than", b, /*emit_else=*/false); - SetToFirstInsertPoint(if_smaller_data.true_block, b); - // Swap key1 with key2. - keys_array.EmitWriteArrayElement(keys_index, key2, b); - keys_array.EmitWriteArrayElement(compare_keys_index, key1, b); - for (const auto& values_array : values_arrays) { - // Also swap the values. - auto value1 = values_array.EmitReadArrayElement(keys_index, b); - auto value2 = values_array.EmitReadArrayElement(compare_keys_index, b); - values_array.EmitWriteArrayElement(keys_index, value2, b); - values_array.EmitWriteArrayElement(compare_keys_index, value1, b); + auto current_keys_index = element_pair_index; + if (block_size == 1) { + // If the block size is 1, we take every second element and compare it to + // the next one. + current_keys_index = + b->CreateMul(current_keys_index, index_typed_constant(2)); + } else if (block_size * 2 < iteration_bound) { + // current_keys_index iterates through the 'left' elements of the element + // pairs to be compared. We first need to compute the comparison block to + // which the element belongs. The block id of that block is index / + // block_size. 
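For illustration, a minimal standalone C++ sketch (not from this patch) of the index arithmetic described in the comment above and emitted as IR just below; `iteration_bound` and the sample masks are made-up values, and the variable names only mirror the emitted values:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
  const int64_t iteration_bound = 8;  // stands in for dimension_to_sort_bound
  for (int64_t xor_mask : {1, 3, 2}) {
    // An xor_mask of the form 2^k - 1 flips the low k bits; the block size is
    // then 2^(k - 1), matching the check in EmitCompareLoopBody.
    int64_t block_size = xor_mask;
    if (xor_mask > 1 && (xor_mask & (xor_mask + 1)) == 0) {
      block_size = (xor_mask + 1) / 2;
    }
    std::printf("xor_mask=%lld:", static_cast<long long>(xor_mask));
    for (int64_t pair = 0; pair < iteration_bound / 2; ++pair) {
      int64_t current = pair;
      if (block_size == 1) {
        current = pair * 2;  // every second element, compared with its neighbor
      } else if (block_size * 2 < iteration_bound) {
        const int64_t block_id = pair / block_size;
        const int64_t index_within_block = pair % block_size;
        current = block_id * 2 * block_size + index_within_block;
      }
      const int64_t compare = current ^ xor_mask;
      if (current < compare && compare < iteration_bound) {
        std::printf(" (%lld,%lld)", static_cast<long long>(current),
                    static_cast<long long>(compare));
      }
    }
    std::printf("\n");
  }
  return 0;
}

For xor_mask = 3 this prints the pairs (0,3) (1,2) (4,7) (5,6), i.e. each block of four elements is mirrored against itself, which is the merge step that masks of the form 2^k - 1 implement in the bitonic sort.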
+ auto block_id = + b->CreateUDiv(current_keys_index, index_typed_constant(block_size)); + // The index of the 'left' element within its block is simply the remainder + // when dividing by 'block_size'. + auto index_within_block = + b->CreateURem(current_keys_index, index_typed_constant(block_size)); + // The first element of the 'left' block of elements that is compared + // against elements from the adjacent 'right' block of elements is + // 'block_id' * (2 * 'block_size'). + auto first_element_in_block = + b->CreateMul(block_id, index_typed_constant(2 * block_size)); + current_keys_index = + b->CreateAdd(first_element_in_block, index_within_block); } + auto compare_keys_index = + b->CreateXor(current_keys_index, index_typed_constant(xor_mask)); + // current_keys_index < compare_keys_index + llvm::Value* is_smaller_index = + b->CreateICmpSLT(current_keys_index, compare_keys_index); + // compare_keys_index < iteration_bound + llvm::Value* index_is_inbounds = b->CreateICmpSLT( + compare_keys_index, index_typed_constant(iteration_bound)); + llvm::Value* do_comparison = + needs_bounds_checks ? b->CreateAnd(is_smaller_index, index_is_inbounds) + : b->getInt1(true); + + // if (is_smaller_index && index_is_inbounds) + KernelSupportLibrary ksl(b); + ksl.IfReturnVoid("smaller_comparison_index", do_comparison, [&]() { + auto key1 = read_element(0, current_keys_index); + auto key2 = read_element(0, compare_keys_index); + auto compare_key1 = key1; + auto compare_key2 = key2; + bool is_signed_comparison = true; + if (primitive_util::IsFloatingPointType(key_type)) { + // We would like a total order of floating point numbers so that the + // sort has a predictable behavior in the presence of NaNs. Rather + // than using floating point comparison, we use the following trick: + // If f is a float, and + // x = bit_cast(f); + // y = x < 0 ? 0x7FFFFFFF - x : x; + // then y is ordered as an int32 such that finite values have the + // obvious order, -0 is ordered before 0, and -NaN and NaN appear at + // the beginning and end of the ordering. + auto k = b->getInt(llvm::APInt::getSignedMaxValue( + key1->getType()->getPrimitiveSizeInBits())); + auto comparison_type = k->getType(); + auto zero = llvm::ConstantInt::get(comparison_type, 0); + auto maybe_flip = [&](llvm::Value* v) { + return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero), + b->CreateSub(k, v), v); + }; + compare_key1 = b->CreateBitCast(key1, comparison_type); + compare_key2 = b->CreateBitCast(key2, comparison_type); + compare_key1 = maybe_flip(compare_key1); + compare_key2 = maybe_flip(compare_key2); + } else if (!primitive_util::IsSignedIntegralType(key_type)) { + is_signed_comparison = false; + } + // If key2 < key1 + ksl.IfReturnVoid( + "is_smaller_than", + b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT + : llvm::ICmpInst::ICMP_ULT, + compare_key2, compare_key1), + [&]() { + // Swap key1 with key2. + write_element(0, current_keys_index, key2); + write_element(0, compare_keys_index, key1); + for (int64 i = 1; i <= num_values; ++i) { + // Also swap the values. 
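For illustration, a self-contained sketch (not from this patch) of the integer total-order trick restated in the comment above; `TotalOrderKey` and the sample values are invented here, and the wrapping subtraction stands in for the wrapping sub the emitter produces:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Maps a float to an int32 whose signed ordering gives a total order on
// floats: finite values in the usual order, -0 before +0, NaNs at the ends.
int32_t TotalOrderKey(float f) {
  int32_t x;
  std::memcpy(&x, &f, sizeof(x));  // x = bit_cast<int32>(f)
  if (x < 0) {
    // y = 0x7FFFFFFF - x, computed with wrap-around like the emitted IR.
    const uint32_t y = 0x7FFFFFFFu - static_cast<uint32_t>(x);
    std::memcpy(&x, &y, sizeof(x));
  }
  return x;
}

int main() {
  const float samples[] = {-2.0f, -1.0f, -0.0f, 0.0f, 1.0f, 2.0f};
  for (float f : samples) {
    // The printed keys increase strictly across this list.
    std::printf("% .1f -> %d\n", f, static_cast<int>(TotalOrderKey(f)));
  }
  return 0;
}

The keys increase strictly across -2.0, -1.0, -0.0, +0.0, 1.0, 2.0, so a plain signed integer compare on the keys sorts the floats, with -0 ordered before +0 as the comment promises.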
+ auto value1 = read_element(i, current_keys_index); + auto value2 = read_element(i, compare_keys_index); + write_element(i, current_keys_index, value2); + write_element(i, compare_keys_index, value1); + } + }); + }); +} + +void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index, + int64 dimension_to_sort, + int64 dimension_to_sort_bound, + PrimitiveType keys_type, + absl::Span xor_masks, + const std::vector& params, + const std::vector& param_shmem_buffers, + int64 tile_size, llvm::IRBuilder<>* b) { + KernelSupportLibrary ksl(b); + llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b); + llvm_ir::AddRangeMetadata(0, tile_size / 2, + llvm::cast(thread_id)); + thread_id = b->CreateIntCast(thread_id, tiled_keys_index.GetType(), + /*isSigned=*/true, "thread.id.x"); + + auto copy_loop_body = + [&](std::function + read_or_write) { + auto value_one = tiled_keys_index.GetConstantWithIndexType(1); + auto current_keys_index = + b->CreateShl(tiled_keys_index[dimension_to_sort], value_one); + // We want to copy two adjacent elements. We first check whether the + // first index position is within bounds. + ksl.IfReturnVoid( + "smaller_keys_index", + b->CreateICmpSLT(current_keys_index, + tiled_keys_index.GetConstantWithIndexType( + dimension_to_sort_bound)), + [&]() { + auto cache_index = b->CreateShl(thread_id, value_one); + read_or_write(cache_index, current_keys_index); + // Increment to go the next index position. + current_keys_index = b->CreateAdd(current_keys_index, value_one); + // Here we check whether the next index position is within bounds. + ksl.IfReturnVoid( + "inner_smaller_keys_index", + b->CreateICmpSLT(current_keys_index, + tiled_keys_index.GetConstantWithIndexType( + dimension_to_sort_bound)), + [&]() { + cache_index = b->CreateAdd(cache_index, value_one); + read_or_write(cache_index, current_keys_index); + }); + }); + }; + + // Copy operand tiles from the operand buffers to shared memory. + IrArray::Index keys_index = tiled_keys_index; + for (int64 i = 0; i < params.size(); ++i) { + copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) { + keys_index[dimension_to_sort] = index; + auto value = params[i].EmitReadArrayElement(keys_index, b); + b->CreateStore(value, + b->CreateGEP(param_shmem_buffers[i], + {tiled_keys_index.GetConstantWithIndexType(0), + cache_index})); + }); + } + // Wait until all reads have happened. + llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b); + + // Now emit the bodies of the comparison loops. + auto read_element = [&](int64 operand, llvm::Value* index) { + return b->CreateLoad( + b->CreateGEP(param_shmem_buffers[operand], + {tiled_keys_index.GetConstantWithIndexType(0), index})); + }; + auto write_element = [&](int64 operand, llvm::Value* index, + llvm::Value* value) { + b->CreateStore( + value, + b->CreateGEP(param_shmem_buffers[operand], + {tiled_keys_index.GetConstantWithIndexType(0), index})); + }; + for (int64 xor_mask : xor_masks) { + // The index of the element pair to be compared within the tile stored in + // shared memory. We order the element pairs by the element with the smaller + // index. + auto element_pair_index = thread_id; + // If 'dimension_to_sort_bound' is evenly divisible by 'tile_size', we don't + // need any bounds checks. + if (dimension_to_sort_bound % tile_size) { + // Otherwise we need a bounds check for the last tile. The last tile has + // size 'dimension_to_sort_bound' % 'tile_size'. 
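For illustration, a tiny standalone sketch (not from this patch) of which tile the bounds check above guards; the concrete sizes are hypothetical and `RoundDownToNearest` is reproduced inline:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t tile_size = 64;
  const int64_t dimension_to_sort_bound = 150;  // not a multiple of tile_size
  // RoundDownToNearest(dimension_to_sort_bound, tile_size)
  const int64_t last_tile_start =
      (dimension_to_sort_bound / tile_size) * tile_size;               // 128
  const int64_t last_tile_size = dimension_to_sort_bound % tile_size;  // 22
  std::printf("tiles start at 0, 64, %lld; only the tile at %lld is partial "
              "(%lld elements) and takes the bounds-checked path\n",
              static_cast<long long>(last_tile_start),
              static_cast<long long>(last_tile_start),
              static_cast<long long>(last_tile_size));
  return 0;
}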
+ ksl.IfReturnVoid( + "is_last_tile", + b->CreateICmpUGE( + b->CreateMul(tiled_keys_index[dimension_to_sort], + tiled_keys_index.GetConstantWithIndexType(2)), + tiled_keys_index.GetConstantWithIndexType( + RoundDownToNearest(dimension_to_sort_bound, tile_size))), + [&]() { + EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type, + params.size() - 1, element_pair_index, xor_mask, + tiled_keys_index.GetType(), read_element, + write_element, b); + }, + [&]() { + EmitCompareLoopBody( + tile_size, keys_type, params.size() - 1, element_pair_index, + xor_mask, tiled_keys_index.GetType(), read_element, + write_element, b, /*needs_bounds_checks=*/false); + }); + } else { + EmitCompareLoopBody(tile_size, keys_type, params.size() - 1, + element_pair_index, xor_mask, + tiled_keys_index.GetType(), read_element, + write_element, b, /*needs_bounds_checks=*/false); + } + // Wait until all comparisons have happened. + llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b); + } + + // Copy the operand tiles back from shared memory to the operand buffers. + for (int64 i = 0; i < params.size(); ++i) { + copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) { + keys_index[dimension_to_sort] = index; + auto value = b->CreateLoad(b->CreateGEP( + param_shmem_buffers[i], + {tiled_keys_index.GetConstantWithIndexType(0), cache_index})); + params[i].EmitWriteArrayElement(keys_index, value, b); + }); + } + // We should normally synchronize here to make sure all writes have happened. + // However the very next thing each thread does is reading 2 elements from the + // operand buffer and writing it into the same location in shared memory from + // which it previously copied it to the operand buffer, and we synchronize + // after this has happened. We can be sure that a thread always writes to the + // same location in shared memory because we have exactly tile_size / 2 many + // threads, and the linear index calculated by ParallelLoopEmitter uses + // linear_index = blockIdx.x * blockDim.x + threadIdx.x; } } // namespace Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, const std::vector& values_arrays, - absl::string_view name, llvm::Value* xor_mask, - llvm::IRBuilder<>* b, - const gpu::LaunchDimensions* launch_dimensions) { + absl::string_view name, + absl::Span xor_masks, llvm::IRBuilder<>* b, + const gpu::LaunchDimensions& launch_dimensions, + int64 num_iterations_in_sort_dim, + const int64 tile_size) { + // Iterate through the keys shape in physical order, but skip the dimension to + // sort and make it the innermost loop which is the loop where the comparisons + // happen. In the dimension to sort, if we use tiling, we iterate through it + // in tiles of 64 elements each, so we use another loop that happens within + // one thread to process this tile worth of data (thereby combining several + // comparison stages of the bitonic sort algorithm because they all happen + // within those 64 elements and are therefore independent of the other + // comparisons). + const Shape& keys_shape = keys_array.GetShape(); - - // Create loop nests which loop through the operand dimensions. The sort - // dimension is handled in the innermost loop which performs the sorting. 
- ForLoopNest loop_nest(name, b); - IrArray::Index keys_index = - loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys"); - if (loop_nest.GetInnerLoopBodyBasicBlock() != nullptr) { - SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b); - } - - // 'compare_keys_index' is the index of the element that 'keys_index' should - // be compared to. - IrArray::Index compare_keys_index(keys_index.GetType()); - for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) { + int64 rank = ShapeUtil::Rank(keys_shape); + int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); + std::vector dimensions_in_iteration_order(rank); + std::vector iteration_order_to_logical_order(rank); + int64 dim = 0; + for (int64 dimension : LayoutUtil::MinorToMajor(keys_shape)) { if (dimension != dimension_to_sort) { - compare_keys_index.push_back(keys_index[dimension]); - } else { - compare_keys_index.push_back(nullptr); + dimensions_in_iteration_order[dim] = keys_shape.dimensions(dimension); + iteration_order_to_logical_order[dim++] = dimension; + } + } + dimensions_in_iteration_order[dim] = num_iterations_in_sort_dim; + iteration_order_to_logical_order[dim] = dimension_to_sort; + + Shape iteration_shape = ShapeUtil::MakeShape(keys_shape.element_type(), + dimensions_in_iteration_order); + std::vector params(1, keys_array); + params.insert(params.end(), values_arrays.begin(), values_arrays.end()); + + // Allocate shared memory for the tiled compare loop. + std::vector param_shmem_buffers(params.size(), nullptr); + if (xor_masks.size() > 1) { + llvm::Module* module = b->GetInsertBlock()->getParent()->getParent(); + for (int64 i = 0; i < params.size(); ++i) { + llvm::Type* tile_type = + llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType( + params[i].GetShape().element_type(), module), + tile_size); + param_shmem_buffers[i] = llvm_ir::AllocateSharedMemoryTile( + module, tile_type, absl::StrCat(name, "_tile_param_", i)); } } - // Naive C++ code for the inner compare loop: - // - // for (int64 i = 0; i < dimension_to_sort_bound; ++i) { - // int64 j = i ^ xor_mask; - // if (i < j && j < dimension_to_sort_bound) { - // int64 min_key = std::min(keys[i], keys[j]); - // keys[j] = std::max(keys[i], keys[j]); - // keys[i] = min_key; - // } - // } - // - // This follows the algorithm described on Wikipedia: - // https://en.wikipedia.org/wiki/Bitonic_sorter - - int64 dimension_to_sort_bound = - keys_array.GetShape().dimensions(dimension_to_sort); - Shape compare_shape = ShapeUtil::MakeShape(keys_shape.element_type(), - {dimension_to_sort_bound}); auto compare_loop_body_emitter = - [&](const IrArray::Index& compare_index) -> Status { - keys_index[dimension_to_sort] = compare_index[0]; - compare_keys_index[dimension_to_sort] = - b->CreateXor(compare_index[0], xor_mask); - EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, - keys_array, values_arrays, b); + [&](const IrArray::Index& tiles_index) -> Status { + // Naive C++ code for the inner compare loop: + // + // for (int64 i = 0; i < dimension_to_sort_bound; ++i) { + // int64 j = i ^ xor_mask; + // /* emitted in EmitCompareLoopBody() */ + // if (i < j && j < dimension_to_sort_bound) { + // int64 min_key = std::min(keys[i], keys[j]); + // keys[j] = std::max(keys[i], keys[j]); + // keys[i] = min_key; + // } + // } + // + // This follows the algorithm described on Wikipedia: + // https://en.wikipedia.org/wiki/Bitonic_sorter + IrArray::Index keys_index(tiles_index.GetType(), rank); + for (int64 i = 0; i < rank; ++i) 
{ + keys_index[iteration_order_to_logical_order[i]] = tiles_index[i]; + } + if (xor_masks.size() > 1) { + EmitTiledCompareLoop(keys_index, dimension_to_sort, + dimension_to_sort_bound, keys_shape.element_type(), + xor_masks, params, param_shmem_buffers, tile_size, + b); + } else { + auto read_element = [&](int64 operand, llvm::Value* index) { + keys_index[dimension_to_sort] = index; + return params[operand].EmitReadArrayElement(keys_index, b); + }; + auto write_element = [&](int64 operand, llvm::Value* index, + llvm::Value* value) { + keys_index[dimension_to_sort] = index; + params[operand].EmitWriteArrayElement(keys_index, value, b); + }; + EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(), + values_arrays.size(), tiles_index[rank - 1], + xor_masks[0], tiles_index.GetType(), read_element, + write_element, b); + } return Status::OK(); }; - if (launch_dimensions != nullptr) { - TF_RETURN_IF_ERROR(gpu::ParallelLoopEmitter(compare_loop_body_emitter, - compare_shape, - *launch_dimensions, b) - .EmitLoop(name)); - } else { - TF_RETURN_IF_ERROR(LoopEmitter(compare_loop_body_emitter, compare_shape, b) - .EmitLoop(name)); - } - - // Set the IR builder insert point to the exit basic block of the outer most - // loop. This ensures later instructions are inserted after this loop nest. - b->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); - - return Status::OK(); + return gpu::ParallelLoopEmitter(compare_loop_body_emitter, iteration_shape, + launch_dimensions, b) + .EmitLoop(name); } } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h index 2f3bcda2307..556a217322d 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" @@ -29,13 +30,14 @@ namespace xla { namespace llvm_ir { // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort' // dimension of 'keys_array'. All other dimensions are kept as-is. This -// implements the inner loop of BitonicSort. If 'launch_dimensions' is nullptr, -// the inner compare loop will not be parallelized. +// implements the inner loop of BitonicSort. It is assumed that 'xor_masks' +// contains only powers of 2, or values 2^k - 1 (k > 0). Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, const std::vector& values_arrays, - absl::string_view name, llvm::Value* xor_mask, - llvm::IRBuilder<>* b, - const gpu::LaunchDimensions* launch_dimensions); + absl::string_view name, + absl::Span xor_masks, llvm::IRBuilder<>* b, + const gpu::LaunchDimensions& launch_dimensions, + int64 num_iterations_in_sort_dim, int64 tile_size); } // namespace llvm_ir } // namespace xla diff --git a/tensorflow/compiler/xla/service/map_inliner_test.cc b/tensorflow/compiler/xla/service/map_inliner_test.cc index 84059dd0f71..fd18bfdc3e7 100644 --- a/tensorflow/compiler/xla/service/map_inliner_test.cc +++ b/tensorflow/compiler/xla/service/map_inliner_test.cc @@ -26,7 +26,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -35,7 +35,7 @@ namespace op = xla::testing::opcode_matchers; namespace xla { namespace { -using MapInlinerTest = HloVerifiedTestBase; +using MapInlinerTest = HloTestBase; // Test that `map` with `max` is transformed to `max` TEST_F(MapInlinerTest, MapMax) { @@ -59,12 +59,12 @@ TEST_F(MapInlinerTest, MapMax) { HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get())); auto computation = builder.Build(); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEmbeddedComputation(std::move(max_f32)); hlo_module->AddEntryComputation(std::move(computation)); MapInliner inliner; - EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie()); + EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie()); EXPECT_THAT(hlo_module->entry_computation()->root_instruction(), op::Maximum(lhs, rhs)); @@ -93,12 +93,12 @@ TEST_F(MapInlinerTest, MapConstant) { HloInstruction::CreateMap(lhs->shape(), {lhs}, const2_f32.get())); auto computation = builder.Build(); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEmbeddedComputation(std::move(const2_f32)); hlo_module->AddEntryComputation(std::move(computation)); HloInstruction* root = hlo_module->entry_computation()->root_instruction(); MapInliner inliner; - EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie()); + EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie()); root = hlo_module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Broadcast(op::Constant())); @@ -131,12 +131,12 @@ TEST_F(MapInlinerTest, MapSubtractOppositeOrder) { HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get())); auto computation = builder.Build(); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEmbeddedComputation(std::move(max_f32)); hlo_module->AddEntryComputation(std::move(computation)); MapInliner inliner; - EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie()); + EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie()); EXPECT_THAT(hlo_module->entry_computation()->root_instruction(), op::Subtract(rhs, lhs)); diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index 2ca527bc4cb..9ccdd7d8d81 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/types.h" @@ -257,7 +258,7 @@ bool MultiOutputFusion::LegalToFuse(HloInstruction* instr1, } void MultiOutputFusion::RecomputeReachability() { - reachability_ = computation_->ComputeReachability(); + reachability_ = HloReachabilityMap::Build(computation_); } void MultiOutputFusion::UpdateReachability( @@ -317,9 +318,9 @@ bool MultiOutputFusion::Perform() { << instr2->fused_instructions_computation()->ToString( HloPrintOptions().set_indent_amount(1)); } + Update(instr1, instr2); HloInstruction* ret = Fuse(instr1, instr2); set_is_fused(ret == instr1 ? instr2 : instr1); - Update(instr1, instr2); changed = true; VLOG(2) << "After fusion, \t this: " << ret->name() << "\n" << ret->fused_instructions_computation()->ToString( diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h index 9508ab2ed1d..1c7583ece72 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.h +++ b/tensorflow/compiler/xla/service/multi_output_fusion.h @@ -23,6 +23,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/statusor.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc index c522e7ae23b..c227106511c 100644 --- a/tensorflow/compiler/xla/service/platform_util.cc +++ b/tensorflow/compiler/xla/service/platform_util.cc @@ -21,7 +21,7 @@ limitations under the License. #include "absl/strings/ascii.h" #include "absl/strings/str_join.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -59,20 +59,15 @@ string CanonicalPlatformName(const string& name) { /* static */ StatusOr> PlatformUtil::GetSupportedPlatforms() { - se::MultiPlatformManager::PlatformMap platform_map; - se::port::Status platforms_status = se::MultiPlatformManager::WithPlatforms( - [&platform_map](se::MultiPlatformManager::PlatformMap* map) { - platform_map = *map; - return se::port::Status::OK(); - }); - if (platform_map.empty()) { + std::vector all_platforms = + se::MultiPlatformManager::AllPlatforms(); + if (all_platforms.empty()) { LOG(WARNING) << "no executor platforms available: platform map is empty"; } // Gather all platforms which have an XLA compiler. std::vector platforms; - for (auto& platform_pair : platform_map) { - auto* platform = platform_pair.second; + for (se::Platform* platform : all_platforms) { auto compiler_status = Compiler::GetForPlatform(platform); if (compiler_status.ok()) { platforms.push_back(platform); @@ -222,8 +217,8 @@ PlatformUtil::GetStreamExecutors(se::Platform* platform) { // fix the number of devices to one. However we do let the user override // this behavior to help run tests on the host that run models in parallel // across multiple devices. 
- device_count = legacy_flags::GetDebugOptionsFromFlags() - .xla_force_host_platform_device_count(); + device_count = + GetDebugOptionsFromFlags().xla_force_host_platform_device_count(); } std::vector stream_executors(device_count, nullptr); VLOG(1) << "Initializing devices"; diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc index 688cceff0cd..b70cb705747 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc @@ -111,7 +111,7 @@ StatusOr ReducePrecisionInsertion::insert_on_inputs( VLOG(2) << "Adding to operand " << i << ": " << operand; if (!is_valid_shape(operand->shape())) { - VLOG(2) << "Skipped: value is not an F32 vector"; + VLOG(2) << "Skipped: value is not of type F32"; continue; } @@ -168,7 +168,7 @@ StatusOr ReducePrecisionInsertion::insert_on_outputs( << instruction->ToString(); if (!is_valid_shape(instruction->shape())) { - VLOG(2) << "Skipped: value is not an F32 nonscalar array"; + VLOG(2) << "Skipped: value is not of type F32"; continue; } diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h index 0b4e82e8d60..76c6a87f176 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h @@ -118,13 +118,7 @@ class ReducePrecisionInsertion : public HloModulePass { // equivalent behavior can be obtained by adding ReducePrecision // instructions after the instructions that pull the F32 arrays out of // the tuples. - // - // TODO(b/64093391): Remove the IsScalar check once this won't cause - // failures on the GPU backend if the ReducePrecision instruction ends up - // inserted between a scalar constant and the init_value argument of a - // Reduce operation. - return shape.element_type() == PrimitiveType::F32 && - !ShapeUtil::IsScalar(shape); + return shape.element_type() == PrimitiveType::F32; } // Is this instruction one such that following or preceding it with a new diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc index 69e4b534bd8..16fa80d53e7 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc @@ -54,7 +54,34 @@ TEST_F(ReducePrecisionInsertionTest, BeforeUnaryInstruction) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, a)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + // Confirm expected state before adding ops. + EXPECT_EQ(computation->root_instruction(), b); + EXPECT_EQ(b->operand(0), a); + + EXPECT_TRUE(InsertOps(module.get(), HloReducePrecisionOptions::OP_INPUTS, + [](const HloInstruction* instruction) { + return instruction->opcode() == HloOpcode::kCos; + })); + + // Confirm expected graph after adding ops. + EXPECT_EQ(computation->root_instruction(), b); + EXPECT_THAT(b->operand(0), op::ReducePrecision(a)); +} + +TEST_F(ReducePrecisionInsertionTest, BeforeUnaryScalarInstruction) { + auto builder = HloComputation::Builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {}); + + // Create a simple graph with a parameter feeding a unary cosine function. 
+ HloInstruction* a = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a")); + HloInstruction* b = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCos, a)); + + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -84,7 +111,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeBinaryInstruction) { HloInstruction* c = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -113,7 +140,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeZeroInputInstruction) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, a)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -146,7 +173,7 @@ TEST_F(ReducePrecisionInsertionTest, AvoidAddingDuplicateInstructions) { HloInstruction* d = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, b, c)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -178,7 +205,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterRootInstruction) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, a)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -215,7 +242,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterNonRootInstruction) { HloInstruction* c = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a_cos, b_cos)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -242,7 +269,7 @@ TEST_F(ReducePrecisionInsertionTest, OutputIsNotFloat) { HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -268,7 +295,7 @@ TEST_F(ReducePrecisionInsertionTest, ShouldReduceOutputPrecisionIsFalse) { HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -294,7 +321,7 @@ TEST_F(ReducePrecisionInsertionTest, InsertionIsNotRecursive) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateReducePrecision(shape, a, 8, 23)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. 
@@ -321,7 +348,7 @@ TEST_F(ReducePrecisionInsertionTest, SkipRedundantReducePrecisionAfter) { HloInstruction* y = builder.AddInstruction( HloInstruction::CreateReducePrecision(shape, x, 5, 10)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -349,7 +376,7 @@ TEST_F(ReducePrecisionInsertionTest, AddNonRedundantReducePrecision) { HloInstruction* y = builder.AddInstruction( HloInstruction::CreateReducePrecision(shape, x, 8, 23)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -375,7 +402,7 @@ TEST_F(ReducePrecisionInsertionTest, IgnoreOpsInsideFusionNode) { builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Manually fuse the kCos operation into a fusion operation. @@ -411,7 +438,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInHeadOfFusionNode) { builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Manually fuse the kCos operation into a fusion operation. @@ -458,7 +485,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInTailOfFusionNode) { builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Manually fuse the kCos operation into a fusion operation. diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc index fcf269eee92..341659b15c4 100644 --- a/tensorflow/compiler/xla/service/reshape_mover_test.cc +++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -34,9 +34,10 @@ namespace { namespace op = xla::testing::opcode_matchers; -class ReshapeMoverTest : public HloVerifiedTestBase {}; +class ReshapeMoverTest : public HloTestBase {}; TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -50,12 +51,12 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) { builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, reshape1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), op::Reshape(param1))); - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), op::Reshape(param1))); @@ -74,6 +75,7 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) { // Verifies that the reshape is not moved, since rng0 is trivially reshapable // and therefore there is no nontrivial reshapes to move. TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); auto rng0 = builder.AddInstruction(HloInstruction::CreateRng( @@ -92,18 +94,19 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) { builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, const1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(rng0), const1)); - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(rng0), const1)); } TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {}); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -117,12 +120,12 @@ TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) { builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, reshape1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), op::Reshape(param1))); - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT( computation->root_instruction(), @@ -130,6 +133,7 @@ TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) { } TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder 
builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -143,11 +147,11 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) { builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, reshape1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), op::Reshape(param1))); - EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Add(param0, param1))); @@ -177,6 +181,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) { // | // reshape4 TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {2, 3}); auto const0 = builder.AddInstruction( @@ -196,12 +201,12 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) { builder.AddInstruction(HloInstruction::CreateTernary( root_shape, HloOpcode::kSelect, const0, reshape1, reshape2)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Select(const0, reshape1, reshape2)); - EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Select(op::Reshape(const0), param1, param2))); @@ -221,6 +226,7 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) { // Verifies that the reshape0 does not sink below add, because param1 is not // trivially reshapable nor is a Reshape/Transpose. TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -232,11 +238,11 @@ TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) { builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, param1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), param1)); - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), param1)); @@ -257,6 +263,7 @@ TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) { // Verifies that we don't unnecessarily sink reshapes, which are in fact // trivial reshapes. 
TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {3, 2}); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -275,12 +282,12 @@ TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) { builder.AddInstruction(HloInstruction::CreateTernary( root_shape, HloOpcode::kSelect, pred, reshape0, reshape1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Select(pred, op::Reshape(const0), op::Reshape(const1))); - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Select(pred, op::Reshape(const0), op::Reshape(const1))); @@ -309,6 +316,7 @@ TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) { // // (note that reshape1 here is trivial). TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {2, 3}); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -320,12 +328,12 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) { builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, const1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), const1)); - EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Add(param0, op::Reshape(const1)))); @@ -348,6 +356,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) { // For now we treat it as non-trivial, so we verify that we don't sink the // reshapes in this case. 
TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {1, 1, 3}); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -362,12 +371,12 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) { builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, reshape1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), op::Reshape(const1))); - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Add(op::Reshape(param0), op::Reshape(const1))); @@ -376,6 +385,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) { } TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( @@ -389,14 +399,14 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, reshape1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); computation->CreateFusionInstruction({add}, HloInstruction::FusionKind::kLoop); EXPECT_THAT(computation->root_instruction(), op::Fusion(op::Reshape(param0), op::Reshape(param1))); - EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Fusion(param0, param1))); @@ -405,6 +415,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) { } TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); auto pred_shape = ShapeUtil::MakeShape(PRED, {8, 7}); @@ -423,13 +434,13 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) { builder.AddInstruction(HloInstruction::CreateTernary( root_shape, HloOpcode::kSelect, reshape_pred, reshape0, reshape1)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT( computation->root_instruction(), op::Select(op::Reshape(pred), op::Reshape(param0), op::Reshape(param1))); - EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Select(pred, param0, param1))); @@ -438,6 +449,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) { } TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) { + auto m = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {}); auto pred_shape = ShapeUtil::MakeShape(PRED, {}); @@ -452,11 +464,11 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) { auto select = builder.AddInstruction(HloInstruction::CreateTernary( root_shape, HloOpcode::kSelect, reshape_pred, param0, param1)); 
- auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Select(op::Reshape(pred), param0, param1)); - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Select(op::Reshape(pred), param0, param1)); @@ -477,6 +489,7 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) { // // We expect reshape{0,1} AND reshape{2,3} to be lifted. TEST_F(ReshapeMoverTest, MultiplePasses) { + auto m = CreateNewVerifiedModule(); auto shape1 = ShapeUtil::MakeShape(F32, {1, 8, 1, 7}); auto shape2 = ShapeUtil::MakeShape(F32, {8, 7, 1}); auto shape3 = ShapeUtil::MakeShape(F32, {8, 7}); @@ -500,14 +513,14 @@ TEST_F(ReshapeMoverTest, MultiplePasses) { builder.AddInstruction(HloInstruction::CreateBinary(shape3, HloOpcode::kAdd, reshape2, reshape3)); - auto computation = module().AddEntryComputation(builder.Build()); + auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT( computation->root_instruction(), op::Add(op::Reshape(param2), op::Reshape(op::Add(op::Reshape(param0), op::Reshape(param1))))); - EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie()); + EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie()); EXPECT_THAT( computation->root_instruction(), @@ -526,11 +539,11 @@ TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) { } )"; - ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get())); EXPECT_TRUE(changed); - EXPECT_THAT(module().entry_computation()->root_instruction(), + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Transpose(op::Multiply())); } @@ -555,8 +568,8 @@ TEST_F(ReshapeMoverTest, ReshapeWithUsersOutsideCandidatesNotSink) { } )"; - ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get())); EXPECT_FALSE(changed); } @@ -580,10 +593,10 @@ TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink1) { } )"; - ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get())); EXPECT_TRUE(changed); - EXPECT_THAT(module().entry_computation()->root_instruction(), + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Tuple(op::Reshape(), op::Reshape(), op::Reshape())); } @@ -597,10 +610,10 @@ TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink2) { } )"; - ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get())); EXPECT_TRUE(changed); - EXPECT_THAT(module().entry_computation()->root_instruction(), + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Reshape(op::Add())); } diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 6f9094a5c2e..75f7413b3c3 100644 --- 
a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -23,9 +23,9 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -292,7 +292,7 @@ StatusOr> Service::CreateModuleConfig( config->set_seed(execution_options->seed()); config->set_debug_options(execution_options->debug_options()); } else { - config->set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + config->set_debug_options(GetDebugOptionsFromFlags()); } if (execute_backend_ != nullptr && @@ -760,38 +760,6 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, return Status::OK(); } -Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg, - ExecuteResponse* result) { - ExecuteGraphParallelRequest parallel_arg; - *parallel_arg.add_requests() = *arg; - ExecuteParallelResponse parallel_result; - TF_RETURN_IF_ERROR(ExecuteGraphParallel(¶llel_arg, ¶llel_result)); - return PickParallelResponse(parallel_result, result); -} - -Status Service::PickParallelResponse( - const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) { - // The "result device" selection is a bit hacky, but better than assuming it - // is device 0. We have b/76035356 for restructuring the client API to clean - // up the current asymmetries and support more functionalities. - for (int64 i = 0; i < parallel_result.responses_size(); ++i) { - TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer, - allocation_tracker_.ResolveForReplica( - parallel_result.responses(i).output(), 0)); - const Shape& shape = buffer->on_host_shape(); - if (!ShapeUtil::IsEmptyTuple(shape)) { - *result = parallel_result.responses(i); - VLOG(3) << "Fetching result from device " << i << ": " - << ShapeUtil::HumanString(shape); - return Status::OK(); - } - } - TF_RET_CHECK(parallel_result.responses_size() > 0); - *result = parallel_result.responses(0); - VLOG(1) << "Defaulting to device 0 result"; - return Status::OK(); -} - StatusOr> Service::BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, @@ -836,10 +804,8 @@ StatusOr> Service::BuildExecutable( return std::move(executable); } -Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, - ExecuteResponse* result) { - VLOG(1) << "running execute-graph request"; - +Status Service::Compile(const CompileRequest* arg, CompileResponse* result) { + VLOG(1) << "running compile request"; if (!arg->has_computation()) { return InvalidArgument("computations may not be empty"); } @@ -847,22 +813,21 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, return InvalidArgument("programe shape may not be empty"); } - // If we received multiple device handles, we must partition the module. 
if (arg->execution_options().device_handles_size() > 1) { - return ExecuteOneToN(arg, result); + return InvalidArgument( + "The compile request does not support multiple device handles."); } - TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_, - SingleComputationDeviceHandle())); - TF_ASSIGN_OR_RETURN( - std::vector> replicated_arguments, - ResolveAndValidateArguments(arg->arguments(), replicas)); - + std::vector argument_shapes; + absl::c_transform(arg->input_shape_with_layout(), + std::back_inserter(argument_shapes), + [](const Shape& shape) { return &shape; }); TF_ASSIGN_OR_RETURN( + std::unique_ptr module_config, CreateModuleConfig(arg->computation().host_program_shape(), - replicated_arguments.front(), - arg->execution_options())); + argument_shapes, &arg->execution_options())); + VLOG(3) << "Compile created HloModuleConfig computation layout: " + << module_config->entry_computation_layout().ToString(); TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, @@ -871,6 +836,48 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, execute_backend_->default_stream_executor(), /*device_allocator=*/nullptr)); + *result->mutable_handle() = compilation_cache_.Insert(std::move(executable)); + + VLOG(1) << "successfully completed 'compile' request"; + return Status::OK(); +} + +Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { + VLOG(1) << "running execute request"; + if (!arg->has_handle()) { + return InvalidArgument("execution handle should not be empty"); + } + TF_ASSIGN_OR_RETURN(auto executable, + compilation_cache_.LookUp(arg->handle())); + + TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_, + SingleComputationDeviceHandle())); + TF_ASSIGN_OR_RETURN( + std::vector> replicated_arguments, + ResolveAndValidateArguments(arg->arguments(), replicas)); + + // Check that the replicated_arguments has the same shape and layout as the + // module config used when creating the executable.
+ const int64 num_module_args = + executable->module_config().entry_computation_layout().parameter_count(); + if (num_module_args != arg->arguments_size()) { + return InvalidArgument( + "The executable expects %lld arguments, but sees %lld.", + num_module_args, arg->arguments_size()); + } + for (int64 i = 0; i < num_module_args; i++) { + const Shape& shape_module = + executable->module_config().entry_computation_layout().parameter_shape( + i); + const Shape& shape_arg = replicated_arguments.front()[i]->on_host_shape(); + if (!ShapeUtil::Equal(shape_module, shape_arg)) { + return InvalidArgumentStrCat( + "The executable expects the ", i, "th argument in shape ", + ShapeUtil::HumanStringWithLayout(shape_module), " but sees ", + ShapeUtil::HumanStringWithLayout(shape_arg)); + } + } + TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( execute_backend_->default_stream_executor())); @@ -884,9 +891,10 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, TF_ASSIGN_OR_RETURN( *result->mutable_output(), - ExecuteAndRegisterResult( - executable.get(), replicated_arguments, execute_backend_.get(), - "result of " + arg->computation().name(), result->mutable_profile())); + ExecuteAndRegisterResult(executable.get(), replicated_arguments, + execute_backend_.get(), + "result of " + executable->module().name(), + result->mutable_profile())); if (executable->dumping_snapshot()) { TF_ASSIGN_OR_RETURN( @@ -898,7 +906,7 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, TF_RETURN_IF_ERROR(executable->DumpHloSnapshot()); } - VLOG(1) << "successfully completed 'execute-graph' request"; + VLOG(1) << "successfully completed 'execute' request"; return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index 8cf1a7b9f01..11e1a79552f 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -22,11 +22,12 @@ limitations under the License. #include #include "absl/types/span.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/executable_run_options.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/allocation_tracker.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/channel_tracker.h" +#include "tensorflow/compiler/xla/service/compilation_cache.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/execution_tracker.h" @@ -90,11 +91,14 @@ class Service : public ServiceInterface { Status DeconstructTuple(const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) override; - // Executes a computation with the provided global data passed as - // immutable arguments. The request contains the whole computation graph. - // Returns global data output and execution timing. - Status ExecuteGraph(const ExecuteGraphRequest* arg, - ExecuteResponse* result) override; + // Compiles a computation into an executable. The request contains the whole + // computation graph. Returns the handle to the executable. + Status Compile(const CompileRequest* arg, CompileResponse* result) override; + + // Executes an executable with the provided global data passed as immutable + // arguments. The request contains the handle to the executable. Returns + // global data output and execution timing.
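Taken together, the Compile and Execute handlers above replace the single ExecuteGraph round trip with a two-step protocol (the Execute declaration follows just below). A hypothetical caller-side sketch, using only the request/response fields exercised by the handlers above; error handling and ExecutionOptions are elided, and `hlo_module_proto`, `argument_shapes_with_layout` and `argument_handles` stand in for caller-side state:

  CompileRequest compile_request;
  *compile_request.mutable_computation() = hlo_module_proto;  // HloModuleProto of the graph
  for (const Shape& shape : argument_shapes_with_layout) {
    *compile_request.add_input_shape_with_layout() = shape;
  }
  CompileResponse compile_response;
  TF_RETURN_IF_ERROR(service->Compile(&compile_request, &compile_response));

  // The returned handle names the cached Executable and can be executed repeatedly.
  ExecuteRequest execute_request;
  *execute_request.mutable_handle() = compile_response.handle();
  for (const GlobalDataHandle& argument : argument_handles) {
    *execute_request.add_arguments() = argument;
  }
  ExecuteResponse execute_response;
  TF_RETURN_IF_ERROR(service->Execute(&execute_request, &execute_response));
  // execute_response.output() now names the result allocation held by the service.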
+ Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override; // Executes one or more computations in parallel with the provided global data // passed as immutable arguments. Returns global data output for each @@ -179,10 +183,6 @@ class Service : public ServiceInterface { absl::Span arguments, const ExecutionOptions& execution_options); - // Picks a parallel response and fills the result. - Status PickParallelResponse(const ExecuteParallelResponse& parallel_result, - ExecuteResponse* result); - // Prepare the executors for executing parallel. StatusOr> GetExecutors( const ExecutionOptions& execution_options, int64 requests_size, @@ -254,11 +254,6 @@ class Service : public ServiceInterface { Backend* backend, absl::Span device_handles, absl::Span result_tags, ExecutionProfile* profile); - // Executes a single computation which has more than one target device. - // The N devices are expected to all return an empty tuple, but one, which - // will be the result of this computation. - Status ExecuteOneToN(const ExecuteGraphRequest* arg, ExecuteResponse* result); - // Convenience function which checks whether the given client_shape // (presumably passed by the client to set the result layout) is valid for the // given computation result shape. @@ -281,6 +276,9 @@ class Service : public ServiceInterface { ServiceOptions options_; + // Cache containing previously built Executables. + CompilationCache compilation_cache_; + // Tracks channels created via the API. ChannelTracker channel_tracker_; diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 2f8f092303e..61a60ef9efa 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -2031,6 +2031,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return operand_shape; } +/* static */ StatusOr ShapeInference::InferGetDimensionSizeShape( + const Shape& shape, int64 dimension) { + if (dimension < 0 || dimension >= ShapeUtil::Rank(shape)) { + return InvalidArgument("GetDimensionSize dimension out of bounds: %d.", + dimension); + } + + return ShapeUtil::MakeShape(S64, {}); +} + /* static */ StatusOr ShapeInference::InferSliceShape( const Shape& arg, absl::Span starts, absl::Span limits, absl::Span strides) { @@ -2833,6 +2843,15 @@ Status ValidateScatterDimensionNumbers( } } + // Validate window size. + auto window_size = dim_numbers.update_window_dims_size() + + dim_numbers.inserted_window_dims_size(); + if (window_size != ShapeUtil::Rank(operand_shape)) { + return InvalidArgument( + "Scatter op has window of size %d; doesn't match operand of rank %d.", + window_size, ShapeUtil::Rank(operand_shape)); + } + // Validate scatter_dims_to_operand_dims in ScatterDimensionNumbers. 
if (dim_numbers.scatter_dims_to_operand_dims_size() != scatter_indices_shape[dim_numbers.index_vector_dim()]) { diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index cd4e5ab52ca..31ef4b2e410 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -291,6 +291,9 @@ class ShapeInference { const Shape& updates_shape, const ProgramShape& to_apply_shape, const ScatterDimensionNumbers& scatter_dim_numbers); + static StatusOr InferGetDimensionSizeShape(const Shape& shape, + int64 dimension); + private: // Helper that infers the shape produced by performing an element-wise binary // operation with the given LHS and RHS shapes. diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 7b65e8c1c9d..4639e32db4d 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -2673,5 +2673,23 @@ TEST_F(ScatterGatherShapeInferenceTest, << statusor.status(); } +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_InsufficientWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_scalar_, + ShapeUtil::MakeShape(F32, {30, 29, 28, 27}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0, 1, 2, 3}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/0)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Scatter op has window of size 4; doesn't match operand of rank 5.")) + << statusor.status(); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 56952e3adae..28a30b5ee2d 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -157,4 +157,23 @@ void ScopedShapedBuffer::Deallocate() { } } +ScopedShapedBuffer ScopedShapedBuffer::TakeSubTree(ShapeIndexView index) { + const xla::Shape& sub_on_host_shape = + xla::ShapeUtil::GetSubshape(on_host_shape(), {index}); + const xla::Shape& sub_on_device_shape = + xla::ShapeUtil::GetSubshape(on_device_shape(), {index}); + + ScopedShapedBuffer output(sub_on_host_shape, sub_on_device_shape, + memory_allocator(), device_ordinal()); + auto src_it = buffers().find(index); + auto dst_it = output.buffers().begin(); + while (dst_it != output.buffers().end()) { + dst_it->second = src_it->second; + src_it->second = tensorflow::se::DeviceMemoryBase(nullptr, 0); + ++src_it; + ++dst_it; + } + return output; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index e1d26da4a20..f5210c9cfa6 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -176,6 +176,11 @@ class ScopedShapedBuffer : public ShapedBuffer { // It's the caller's job to ensure that the memory contained therein is freed. TF_MUST_USE_RESULT ShapedBuffer release(); + // Extracts the sub-tree rooted at 'index' and returns a ScopedShapedBuffer + // that holds ownership of the subtree. Sets the buffers corresponding to the + // subtree to null in 'this'. 
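The TakeSubTree declaration follows immediately below; as a usage illustration, a minimal sketch with arbitrary shapes, where `allocator` is assumed to be a caller-owned DeviceMemoryAllocator*:

  Shape elem = ShapeUtil::MakeShape(F32, {16});
  Shape tuple = ShapeUtil::MakeTupleShape({elem, elem, elem});
  ScopedShapedBuffer whole(tuple, tuple, allocator, /*device_ordinal=*/0);
  // ... populate the leaf buffers ...

  // Transfers ownership of every buffer under tuple index {1} to `part`; the
  // corresponding entries in `whole` are left null, so they are not freed twice.
  ShapeIndex subtree_index = {1};
  ScopedShapedBuffer part = whole.TakeSubTree(subtree_index);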
+ ScopedShapedBuffer TakeSubTree(ShapeIndexView index); + protected: void Deallocate(); diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc index d69e6362e91..ca64bd3c8dd 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc @@ -20,6 +20,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/util/ptr_util.h" namespace xla { @@ -107,5 +109,79 @@ TEST(ScopedShapedBufferTest, TestMoveAssignmentOperator) { // TestAllocator's destructor checks that all memory was freed. } +TEST(ScopedShapedBufferTest, TestTakeSubTree) { + TestAllocator allocator; + + Shape s = ShapeUtil::MakeShape(F32, {1}); + s = xla::ShapeUtil::MakeTupleShape(std::vector(2, s)); + s = xla::ShapeUtil::MakeTupleShape(std::vector(3, s)); + + ScopedShapedBuffer sb(s, s, &allocator, /*device_ordinal=*/0); + sb.buffers().ForEachMutableElement( + [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + TF_ASSERT_OK_AND_ASSIGN( + OwningDeviceMemory m, + allocator.Allocate(/*device_ordinal=*/0, /*size=*/77)); + *buffer = m.Forget(); + }); + ShapeTree buffers = sb.buffers(); + + // Takes a subtree out of 'sb', and verifies the buffers are as expected. + xla::ShapeIndex subtree_index = {1}; + ScopedShapedBuffer output = sb.TakeSubTree(subtree_index); + + output.buffers().ForEachElement([&](const xla::ShapeIndex& sub_index, + const se::DeviceMemoryBase& buffer) { + xla::ShapeIndex orig_index = subtree_index; + for (int i : sub_index) { + orig_index.push_back(i); + } + EXPECT_TRUE(buffers.find(orig_index)->second.IsSameAs(buffer)); + }); + sb.buffers().ForEachElement( + [&](const xla::ShapeIndex& index, const se::DeviceMemoryBase& buffer) { + if (ShapeIndexView(index).StartsWith(subtree_index)) { + EXPECT_TRUE(buffer.is_null()); + } else { + EXPECT_TRUE(buffers.find(index)->second.IsSameAs(buffer)); + } + }); +} + +// Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs +// (cardinality of each non-leaf node's children). +void BM_TakeSubTree(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + TestAllocator allocator; + xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = xla::ShapeUtil::MakeTupleShape(shapes); + } + xla::ScopedShapedBuffer shaped_buffer(shape, shape, /*allocator=*/&allocator, + /*device_ordinal=*/0); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + // Extract a buffer from approximately the middle of the first level of the + // tree. 
+ (void)shaped_buffer.TakeSubTree(/*index=*/{fan_out / 2}).release(); + } + tensorflow::testing::StopTiming(); +} + +BENCHMARK(BM_TakeSubTree) + ->ArgPair(1, 4) + ->ArgPair(1, 8) + ->ArgPair(1, 32) + ->ArgPair(1, 64) + ->ArgPair(1, 128) + ->ArgPair(1, 256) + ->ArgPair(1, 512) + ->ArgPair(2, 4) + ->ArgPair(2, 8) + ->ArgPair(2, 32) + ->ArgPair(2, 64) + ->ArgPair(2, 128); + } // anonymous namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc index 79b5c09abb3..7a565bf0768 100644 --- a/tensorflow/compiler/xla/service/transpose_folding_test.cc +++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc @@ -172,7 +172,7 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) { HloInstruction* mul = builder.AddInstruction(HloInstruction::CreateBinary( add->shape(), HloOpcode::kMultiply, add, sub)); - auto module = CreateNewModule("fuse_with_constant_operands"); + auto module = CreateNewUnverifiedModule("fuse_with_constant_operands"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(mul)); HloInstruction* call = module->OutlineExpressionFromComputation( @@ -247,7 +247,7 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) { conv_shape.ValueOrDie(), x, transpose_y, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule("test_module"); + auto module = CreateNewUnverifiedModule("test_module"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(conv)); FoldTranspose(module.get()); @@ -302,7 +302,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) { conv_shape.ValueOrDie(), x, transpose_y, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule("test_module"); + auto module = CreateNewUnverifiedModule("test_module"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(conv)); FoldTranspose(module.get()); @@ -362,7 +362,7 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) { conv_shape.ValueOrDie(), transpose_x, y, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule("test_module"); + auto module = CreateNewUnverifiedModule("test_module"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(conv)); FoldTranspose(module.get()); @@ -428,7 +428,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) { conv_shape.ValueOrDie(), transpose_x, y, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewModule("test_module"); + auto module = CreateNewUnverifiedModule("test_module"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(conv)); FoldTranspose(module.get()); diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index d9ebebf74ed..10ef2d38fa2 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -48,7 +48,7 @@ class TuplePointsToAnalysisTest : public HloTestBase { } void BuildModule(std::unique_ptr computation) { - module_ = CreateNewModule(); + module_ = CreateNewUnverifiedModule(); module_->AddEntryComputation(std::move(computation)); } @@ -809,7 +809,7 @@ TEST_F(FusionPointsToAnalysisTest, FusionParam0TwoUsers) { class PointsToAnalysisTestBase : public 
HloTestBase { protected: void BuildModule(std::unique_ptr computation) { - module_ = CreateNewModule(); + module_ = CreateNewUnverifiedModule(); computation_ = module_->AddEntryComputation(std::move(computation)); } @@ -1176,7 +1176,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { return builder.Build(); }; - module_ = CreateNewModule(); + module_ = CreateNewUnverifiedModule(); HloComputation* cond_computation = module_->AddEmbeddedComputation(make_cond()); HloComputation* body_computation = @@ -1211,7 +1211,7 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) { auto add = sub_builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones)); - module_ = CreateNewModule(); + module_ = CreateNewUnverifiedModule(); auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build()); sub_computation->CreateFusionInstruction({add, ones}, HloInstruction::FusionKind::kLoop); diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc index 516754e2110..65b0f8c8044 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc @@ -25,7 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -34,7 +34,7 @@ namespace op = xla::testing::opcode_matchers; namespace xla { namespace { -class TupleSimplifierTest : public HloVerifiedTestBase { +class TupleSimplifierTest : public HloTestBase { protected: void Run(HloModule* module, bool change_expected) { TupleSimplifier simplifier; @@ -65,10 +65,10 @@ TEST_F(TupleSimplifierTest, TupleOfParameters) { HloInstruction* param2 = builder.AddInstruction( HloInstruction::CreateParameter(2, scalar_shape_, "param2")); builder.AddInstruction(HloInstruction::CreateTuple({param0, param1, param2})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - Run(module, /*change_expected=*/false); + Run(module.get(), /*change_expected=*/false); } TEST_F(TupleSimplifierTest, GteOfTupleOfParameter) { @@ -78,10 +78,10 @@ TEST_F(TupleSimplifierTest, GteOfTupleOfParameter) { HloInstruction::CreateParameter(0, tuple_shape_, "param")); builder.AddInstruction( HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - Run(module, /*change_expected=*/false); + Run(module.get(), /*change_expected=*/false); } TEST_F(TupleSimplifierTest, GteOfTuple) { @@ -98,12 +98,12 @@ TEST_F(TupleSimplifierTest, GteOfTuple) { HloInstruction* gte = builder.AddInstruction( HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 1)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), gte); - Run(module, /*change_expected=*/true); + Run(module.get(), /*change_expected=*/true); EXPECT_THAT(computation->root_instruction(), param1); } @@ -125,13 +125,13 @@ TEST_F(TupleSimplifierTest, GteOfTupleChain) { 
builder.AddInstruction( HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, element)); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Negate(op::GetTupleElement(op::Tuple()))); - Run(module, /*change_expected=*/true); + Run(module.get(), /*change_expected=*/true); EXPECT_THAT(computation->root_instruction(), op::Negate(op::Parameter())); } @@ -157,12 +157,12 @@ TEST_F(TupleSimplifierTest, NestedGteOfTuples) { ShapeUtil::GetTupleElementShape(element->shape(), 0), element, 0)); } - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), element); - Run(module, /*change_expected=*/true); + Run(module.get(), /*change_expected=*/true); EXPECT_THAT(computation->root_instruction(), param); } @@ -182,12 +182,12 @@ TEST_F(TupleSimplifierTest, TupleOfGteInstructions) { HloInstruction* tuple = builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), tuple); - Run(module, /*change_expected=*/true); + Run(module.get(), /*change_expected=*/true); EXPECT_THAT(computation->root_instruction(), tuple_param); } @@ -207,19 +207,19 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) { HloInstruction* tuple = builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1})); - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), tuple); - Run(module, /*change_expected=*/false); + Run(module.get(), /*change_expected=*/false); EXPECT_THAT(computation->root_instruction(), tuple); } TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { // Verify that the root computation can be excluded - auto module = CreateNewModule(); + auto module = CreateNewVerifiedModule(); HloInstruction* p0; HloInstruction* p1; @@ -281,7 +281,7 @@ TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { entry = module->AddEntryComputation(builder.Build()); } - Run(module, /*change_expected=*/true, /*exclude_entry=*/true); + Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/true); EXPECT_THAT(c0->root_instruction(), p0); EXPECT_THAT(c1->root_instruction(), p1); diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc index 541b117e029..68e2569f66b 100644 --- a/tensorflow/compiler/xla/service/while_loop_analysis.cc +++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" namespace xla { @@ -229,4 +232,96 @@ optional ComputeWhileLoopTripCount(HloInstruction* while_op, return nullopt; } +// If the only user of this instruction is a get-tuple-element, return that +// get-tuple-element, otherwise return null. 
If this runs before CSE/DCE, we may +// get a false negative if there are several copies of the same GTE, or there +// are unused GTEs, but we can live with this. +static HloInstruction* GetOnlyGTE(HloInstruction* inst) { + if (inst->user_count() != 1) { + return nullptr; + } + + HloInstruction* user = inst->users().back(); + if (user->opcode() != HloOpcode::kGetTupleElement) { + return nullptr; + } + return user; +} + +optional ComputeWhileLoopTripCountUpperBound(HloInstruction* while_op) { + // If we know the exact trip count, it's also the upper bound. + auto exact_trip_count = ComputeWhileLoopTripCount(while_op); + if (exact_trip_count) { + VLOG(2) << "Loop has exact trip count."; + return exact_trip_count; + } + + // There is one more case we know how to handle. If the loop condition only + // looks at one element of the tuple, and the loop body sets this element to a + // constant, there are two options: + // 1) Evaluating the condition on this constant returns true. In this case, + // the loop either executes 0 times, or is an infinite loop, depending on the + // init value. + // 2) Evaluating the condition on this constant returns false. In this case, + // the loop executes 0 or 1 times, depending on the init value. This means + // that, regardless of the init value, the upper bound on the trip count is 1. + + // Check whether the condition depends on a single parameter, and find out + // which. + auto* while_cond = while_op->while_condition(); + auto* while_cond_param = while_cond->parameter_instruction(0); + auto* cond_gte = GetOnlyGTE(while_cond_param); + if (!cond_gte) { + VLOG(2) << "Induction variable not found in loop condition: " + << while_cond->root_instruction()->ToString(); + return nullopt; + } + + // Now check whether this gets set to a constant by the while body. + auto* while_body = while_op->while_body(); + auto* while_body_root = while_body->root_instruction(); + if (while_body_root->opcode() != HloOpcode::kTuple) { + VLOG(3) << "While body's root is not a tuple instruction: " + << while_body_root->ToString(); + return nullopt; + } + + int64 indvar_index = cond_gte->tuple_index(); + auto* while_body_indvar = while_body_root->operand(indvar_index); + if (while_body_indvar->opcode() != HloOpcode::kConstant) { + VLOG(3) << "While body does not set the IV to a constant: " + << while_body_indvar->ToString(); + return nullopt; + } + + // We have a constant. Evaluate the condition on this constant. + HloEvaluator evaluator(/*max_loop_iterations=*/0); + Literal fake_input = Literal::CreateFromShape(while_cond_param->shape()); + TF_CHECK_OK(fake_input.CopyFrom(while_body_indvar->literal(), + /*dest_shape_index=*/{indvar_index}, + /*src_shape_index=*/{})); + StatusOr eval_result = + evaluator.Evaluate(*while_cond, {std::move(fake_input)}); + + if (!eval_result.ok()) { + VLOG(2) << "Couldn't evaluate while loop condition."; + return nullopt; + } + + Literal cond_result_pred = std::move(eval_result.ValueOrDie()); + CHECK(ShapeUtil::Equal(cond_result_pred.shape(), + ShapeUtil::MakeShape(PRED, {}))); + + // Per the explanation above, if the evaluated condition returns false, the + // loop executes at most once. 
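This helper is consumed later in this change by WhileLoopInvariantCodeMotion; simplified from that hunk, the intended call pattern inside a pass is:

  // Inside a pass that is deciding whether to transform `while_instr`:
  auto upper_bound = ComputeWhileLoopTripCountUpperBound(while_instr);
  if (upper_bound && *upper_bound <= 1) {
    // A loop that runs at most once is not worth optimizing; skip it and let
    // simplification passes clean it up instead.
    return false;
  }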
+ bool cond_returns_true = cond_result_pred.GetFirstElement(); + if (!cond_returns_true) { + VLOG(2) << "Upper bound on the trip count is 1"; + return 1; + } + + VLOG(2) << "Loop has no known upper bound on the trip count."; + return nullopt; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.h b/tensorflow/compiler/xla/service/while_loop_analysis.h index bf497f4892b..ac69a727bd6 100644 --- a/tensorflow/compiler/xla/service/while_loop_analysis.h +++ b/tensorflow/compiler/xla/service/while_loop_analysis.h @@ -28,6 +28,10 @@ namespace xla { absl::optional ComputeWhileLoopTripCount(HloInstruction *while_op, int64 max_value_returned = 128); +// Returns an upper bound on the trip count of the loop if it's statically +// known, nullopt otherwise. +absl::optional ComputeWhileLoopTripCountUpperBound( + HloInstruction *while_op); } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_ diff --git a/tensorflow/compiler/xla/service/while_loop_analysis_test.cc b/tensorflow/compiler/xla/service/while_loop_analysis_test.cc new file mode 100644 index 00000000000..1da0fbeac89 --- /dev/null +++ b/tensorflow/compiler/xla/service/while_loop_analysis_test.cc @@ -0,0 +1,124 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" + +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +class WhileLoopAnalysisTest : public HloTestBase {}; + +TEST_F(WhileLoopAnalysisTest, SingleIterationUpperBound) { + const char* const kHloModule = R"( + HloModule ModuleWithWhile + + body { + p_body = (f32[2], s32[]) parameter(0) + val = f32[2] get-tuple-element(p_body), index=0 + const = s32[] constant(-1) + ROOT root = (f32[2], s32[]) tuple(val, const) + } + + condition { + p_cond = (f32[2], s32[]) parameter(0) + gte = s32[] get-tuple-element(p_cond), index=1 + const = s32[] constant(42) + ROOT result = pred[] equal-to(gte, const) + } + + ENTRY entry { + param.0 = f32[2] parameter(0) + param.1 = s32[] parameter(1) + while_init = (f32[2], s32[]) tuple(param.0, param.1) + ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body + })"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(kHloModule)); + + HloInstruction* while_op = module->entry_computation()->root_instruction(); + EXPECT_EQ(*ComputeWhileLoopTripCountUpperBound(while_op), 1); +} + +TEST_F(WhileLoopAnalysisTest, NoUpperBound) { + const char* const kHloModule = R"( + HloModule ModuleWithWhile + + body { + p_body = (f32[2], s32[]) parameter(0) + val = f32[2] get-tuple-element(p_body), index=0 + const = s32[] constant(42) + ROOT root = (f32[2], s32[]) tuple(val, const) + } + + condition { + p_cond = (f32[2], s32[]) parameter(0) + gte = s32[] get-tuple-element(p_cond), index=1 + const = s32[] constant(42) + ROOT result = pred[] equal-to(gte, const) + } + + ENTRY entry { + param.0 = f32[2] parameter(0) + param.1 = s32[] parameter(1) + while_init = (f32[2], s32[]) tuple(param.0, param.1) + ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body + })"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(kHloModule)); + + HloInstruction* while_op = module->entry_computation()->root_instruction(); + EXPECT_EQ(ComputeWhileLoopTripCountUpperBound(while_op), absl::nullopt); +} + +TEST_F(WhileLoopAnalysisTest, ExactBound) { + const char* const kHloModule = R"( + HloModule ModuleWithWhile + + body { + p_body = (f32[2], s32[]) parameter(0) + val = f32[2] get-tuple-element(p_body), index=0 + index = s32[] get-tuple-element(p_body), index=1 + one = s32[] constant(1) + inc = s32[] add(index, one) + ROOT root = (f32[2], s32[]) tuple(val, inc) + } + + condition { + p_cond = (f32[2], s32[]) parameter(0) + gte = s32[] get-tuple-element(p_cond), index=1 + const = s32[] constant(42) + ROOT result = pred[] less-than(gte, const) + } + + ENTRY entry { + param.0 = f32[2] parameter(0) + param.1 = s32[] constant(0) + while_init = (f32[2], s32[]) tuple(param.0, param.1) + ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body + })"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(kHloModule)); + + HloInstruction* while_op = module->entry_computation()->root_instruction(); + EXPECT_EQ(*ComputeWhileLoopTripCountUpperBound(while_op), 42); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc 
b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc index 067cfcc17d6..8b381dec073 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc @@ -46,8 +46,9 @@ static Status ReplaceUsesWhileKeepingLoopInvariance( return Status::OK(); } -StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody( +StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( HloInstruction* while_instr) { + HloComputation* while_cond = while_instr->while_condition(); HloComputation* while_body = while_instr->while_body(); const HloInstruction& init_value = *while_instr->operand(0); @@ -57,24 +58,48 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody( bool changed = false; - for (HloInstruction* invariant_gte : - WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) { - int64 index = invariant_gte->tuple_index(); + absl::flat_hash_map> + conditional_gte_index_to_insts = + WhileUtil::GetGTEsMapForWhileConditional(*while_cond); + std::vector invariant_body_gtes = + WhileUtil::GetInvariantGTEsForWhileBody(*while_body); + + for (HloInstruction* invariant_body_gte : invariant_body_gtes) { + int64 index = invariant_body_gte->tuple_index(); const HloInstruction& invariant_value = *init_value.operand(index); - // Should have at least one user that's not while_body_root. - if (invariant_gte->user_count() <= 1) { + // Original value should be a constant. + if (invariant_value.opcode() != HloOpcode::kConstant) { continue; } - if (invariant_value.opcode() == HloOpcode::kConstant) { - auto* constant_instr = + // Sink into the while_body. + // Should have at least one user that's not while_body_root. + if (invariant_body_gte->user_count() > 1) { + HloInstruction* constant_instr = while_body->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk")); TF_RETURN_IF_ERROR(ReplaceUsesWhileKeepingLoopInvariance( - invariant_gte, constant_instr, while_body->root_instruction(), + invariant_body_gte, constant_instr, while_body->root_instruction(), index)); changed = true; } + + // Check if there is a corresponding GTE in while_conditional. + auto it = conditional_gte_index_to_insts.find(index); + if (it == conditional_gte_index_to_insts.end()) { + continue; + } + + for (HloInstruction* invariant_cond_gte : it->second) { + // Should have at least one user. + if (invariant_cond_gte->user_count() > 0) { + HloInstruction* constant_instr = while_cond->AddInstruction( + invariant_value.Clone(/*suffix=*/".sunk")); + TF_RETURN_IF_ERROR( + invariant_cond_gte->ReplaceAllUsesWith(constant_instr)); + changed = true; + } + } } return changed; @@ -115,10 +140,8 @@ StatusOr WhileLoopConstantSinking::Run(HloModule* module) { } for (HloInstruction* while_instr : while_instrs) { - // We only sink into while loop bodies, but this can be extended to - // transform conditions as well. TF_ASSIGN_OR_RETURN(bool result, - TrySinkingConstantsIntoWhileBody(while_instr)); + TrySinkingConstantsIntoWhileLoop(while_instr)); changed |= result; } diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h index 577bad6c706..a866bc1264b 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h @@ -23,8 +23,8 @@ limitations under the License. namespace xla { // Sinks while loop invariant values that happen to be constants into the while -// loop body. 
This is probably not a win in isolation but may unlock further -// optimizations like constant folding. +// loop body and conditional. This is probably not a win in isolation but may +// unlock further optimizations like constant folding. // // state = (..., const, ...) // while (pred(state)) { @@ -46,22 +46,19 @@ namespace xla { // tuple trivially loop invariant. WhileLoopSimplifier will later get rid of // `v`. // -// We only sink into while loop bodies, but this can be extended to transform -// conditions as well. -// // TODO(b/79121449): We should also sink broadcasts of constants. class WhileLoopConstantSinking : public HloModulePass { public: ~WhileLoopConstantSinking() override = default; absl::string_view name() const override { - return "while-loop-invariant-code-motion"; + return "while-loop-constant-sinking"; } StatusOr Run(HloModule* module) override; private: - StatusOr TrySinkingConstantsIntoWhileBody(HloInstruction* while_instr); + StatusOr TrySinkingConstantsIntoWhileLoop(HloInstruction* while_instr); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc index d17b86fab5b..75d406435b6 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc @@ -242,5 +242,178 @@ ENTRY entry { } } } + +TEST_F(WhileLoopConstantSinkingTest, ConditionalSinkConstant) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[],f32[]) parameter(0) + p_body.0 = f32[] get-tuple-element((f32[],f32[]) p_body), index=0 + const = f32[] constant(1) + add = f32[] add(p_body.0, const) + p_body.1 = f32[] get-tuple-element((f32[],f32[]) p_body), index=1 + ROOT root = (f32[],f32[]) tuple(add, p_body.1) +} + +condition { + p_cond = (f32[],f32[]) parameter(0) + p_cond.0 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=0 + p_cond.1 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=1 + ROOT result = pred[] less-than(p_cond.0, p_cond.1) +} + +ENTRY entry { + const_0 = f32[] constant(0) + const_1 = f32[] constant(10) + while_init = (f32[],f32[]) tuple(const_0, const_1) + ROOT while = (f32[],f32[]) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_condition = module->GetComputationWithName("condition"); + EXPECT_THAT(while_condition->root_instruction(), op::Lt(_, op::Constant())); +} + +TEST_F(WhileLoopConstantSinkingTest, ConditionalTupleShapedConstants) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_b = (f32[],(f32[],f32[])) parameter(0) + p_b.0 = f32[] get-tuple-element((f32[],(f32[],f32[])) p_b), index=0 + p_b.1 = (f32[],f32[]) get-tuple-element((f32[],(f32[],f32[])) p_b), index=1 + p_b.1.0 = f32[] get-tuple-element((f32[],f32[]) p_b.1), index=0 + add = f32[] add(p_b.0, p_b.1.0) + ROOT root = (f32[],(f32[],f32[])) tuple(add, p_b.1) +} + +condition { + p_c = (f32[],(f32[],f32[])) parameter(0) + p_c.0 = f32[] get-tuple-element((f32[],(f32[],f32[])) p_c), index=0 + p_c.1 = (f32[],f32[]) get-tuple-element((f32[],(f32[],f32[])) p_c), index=1 + p_c.1.1 = f32[] get-tuple-element((f32[],f32[]) p_c.1), index=1 + ROOT result = pred[] less-than(p_c.0, p_c.1.1) +} + +ENTRY entry { + const_0 = f32[] constant(0) + 
const_1 = (f32[], f32[]) constant((f32[], f32[]) (1, 10)) + while_init = (f32[],(f32[],f32[])) tuple(const_0, const_1) + ROOT while = (f32[],(f32[],f32[])) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_condition = module->GetComputationWithName("condition"); + EXPECT_THAT(while_condition->root_instruction(), + op::Lt(_, op::GetTupleElement(op::Constant()))); +} + +TEST_F(WhileLoopConstantSinkingTest, ConditionalDontCreateDeadConstant) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[],f32[],f32[]) parameter(0) + p_body.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=0 + const = f32[] constant(1) + add = f32[] add(p_body.0, const) + p_body.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=1 + p_body.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=2 + ROOT root = (f32[],f32[],f32[]) tuple(add, p_body.1, p_body.2) +} + +condition { + p_cond = (f32[],f32[],f32[]) parameter(0) + p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0 + p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1 + p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2 + ROOT result = pred[] less-than(p_cond.0, p_cond.1) +} + +ENTRY entry { + const_0 = f32[] constant(0) + const_1 = f32[] constant(10) + const_2 = f32[] constant(12) + while_init = (f32[],f32[],f32[]) tuple(const_0, const_1, const_2) + ROOT while = (f32[],f32[],f32[]) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_condition = module->GetComputationWithName("condition"); + EXPECT_THAT(while_condition->root_instruction(), op::Lt(_, op::Constant())); + for (const HloInstruction* inst : while_condition->instructions()) { + if (inst->opcode() == HloOpcode::kConstant) { + EXPECT_GT(inst->user_count(), 0); + } + } +} + +TEST_F(WhileLoopConstantSinkingTest, ConditionalMultipleSameIndexGTEs) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[],f32[],f32[]) parameter(0) + p_body.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=0 + const = f32[] constant(1) + add.0 = f32[] add(p_body.0, const) + p_body.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=1 + add.1 = f32[] add(p_body.1, const) + p_body.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=2 + ROOT root = (f32[],f32[],f32[]) tuple(add.0, add.1, p_body.2) +} + +condition { + p_cond = (f32[],f32[],f32[]) parameter(0) + p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0 + p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2 + lt.0 = pred[] less-than(p_cond.0, p_cond.2) + p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1 + p_cond.2.c = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2 + lt.1 = pred[] less-than(p_cond.1, p_cond.2.c) + ROOT result = pred[] and(lt.0, lt.1) +} + +ENTRY entry { + const_0 = f32[] constant(0) + const_1 = f32[] constant(0) + const_2 = f32[] constant(12) + while_init = (f32[],f32[],f32[]) tuple(const_0, const_1, const_2) + ROOT while = (f32[],f32[],f32[]) 
while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_condition = module->GetComputationWithName("condition"); + EXPECT_THAT(while_condition->root_instruction(), + op::And(op::Lt(_, op::Constant()), op::Lt(_, op::Constant()))); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc index 9795b2830b6..b7c28bfac78 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc @@ -19,6 +19,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/container/inlined_vector.h" #include "tensorflow/compiler/xla/service/tuple_util.h" +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/compiler/xla/service/while_util.h" #include "tensorflow/compiler/xla/util.h" @@ -143,6 +144,12 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( string while_instr_name = while_instr->ToString(print_no_metadata); VLOG(2) << "Trying to hoist from " << while_instr_name; + auto maybe_upper_bound = ComputeWhileLoopTripCountUpperBound(while_instr); + if (maybe_upper_bound && *maybe_upper_bound <= 1) { + VLOG(2) << "Loop has a trip count of at most 1, skipping."; + return false; + } + HloComputation* while_body = while_instr->while_body(); // Maps instructions in the while body to instructions hoisted outside the @@ -180,6 +187,13 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( return false; } + // LICM in the presence of domain instructions is complex, bail. + for (auto* instruction : while_body->MakeInstructionPostOrder()) { + if (instruction->opcode() == HloOpcode::kDomain) { + return false; + } + } + // instructions_to_replace[i] is hoisted into a loop invariant instruction // replacement_instructions[i]. std::vector instructions_to_replace; diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc index 32e69c335b7..046ccb2d3f2 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { @@ -26,7 +26,7 @@ namespace { namespace op = xla::testing::opcode_matchers; -class WhileLoopInvariantCodeMotionTest : public HloVerifiedTestBase { +class WhileLoopInvariantCodeMotionTest : public HloTestBase { public: // Makes a computation which has one parameter, of the given shape, and always // returns PRED[]{true}. This is useful as a dummy loop condition. 
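Both passes touched by this change can be run back to back; a hypothetical pipeline ordering, not taken from this patch, assuming `module` is a valid HloModule* and the usual hlo_pass_pipeline.h include:

  HloPassPipeline pipeline("while-loop-cleanup");
  // Clone loop-invariant constants into the body and, after this change, into
  // the loop condition as well.
  pipeline.AddPass<WhileLoopConstantSinking>();
  // LICM now consults ComputeWhileLoopTripCountUpperBound and skips loops that
  // run at most once.
  pipeline.AddPass<WhileLoopInvariantCodeMotion>();
  TF_ASSIGN_OR_RETURN(bool changed, pipeline.Run(module));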
@@ -58,6 +58,7 @@ HloComputation* WhileLoopInvariantCodeMotionTest::MakeAlwaysTrueComputation( } TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) { + auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32}); @@ -76,19 +77,18 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) { builder.AddInstruction( HloInstruction::CreateTuple({gte_0, gte_1, add_result})); - return module().AddEmbeddedComputation(builder.Build()); + return m->AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); auto* init_value = builder.AddInstruction( HloInstruction::CreateParameter(0, while_shape, "init_value")); builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); - HloComputation* entry_computation = - module().AddEntryComputation(builder.Build()); + while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); + HloComputation* entry_computation = m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); EXPECT_TRUE(simplified_loop); HloInstruction* transformed_while; @@ -100,6 +100,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) { } TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) { + auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32}); @@ -135,19 +136,18 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) { builder.AddInstruction( HloInstruction::CreateTuple({gte_0, gte_1, divide_result})); - return module().AddEmbeddedComputation(builder.Build()); + return m->AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); auto* init_value = builder.AddInstruction( HloInstruction::CreateParameter(0, while_shape, "init_value")); builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); - HloComputation* entry_computation = - module().AddEntryComputation(builder.Build()); + while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); + HloComputation* entry_computation = m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); EXPECT_TRUE(simplified_loop); HloInstruction* transformed_while; @@ -173,6 +173,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) { TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistTriviallyLoopVaryingComputation) { // Basic negative test: the add expression is not loop invariant. 
+ auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32}); @@ -189,20 +190,20 @@ TEST_F(WhileLoopInvariantCodeMotionTest, scalar_s32, HloOpcode::kAdd, gte_0, gte_1)); builder.AddInstruction(HloInstruction::CreateTuple({gte_0, add_result})); - return module().AddEmbeddedComputation(builder.Build()); + return m->AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); auto* init_value = builder.AddInstruction( HloInstruction::CreateParameter(0, while_shape, "init_value")); auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); + while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); EXPECT_FALSE(simplified_loop); EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Add())); @@ -210,6 +211,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistLoopVaryingComputationWithAlternatingTuples) { + auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32}); @@ -228,25 +230,26 @@ TEST_F(WhileLoopInvariantCodeMotionTest, builder.AddInstruction( HloInstruction::CreateTuple({gte_1, gte_0, add_result})); - return module().AddEmbeddedComputation(builder.Build()); + return m->AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); auto* init_value = builder.AddInstruction( HloInstruction::CreateParameter(0, while_shape, "init_value")); auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); + while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); EXPECT_FALSE(simplified_loop); EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Add())); } TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) { + auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); auto token_shape = ShapeUtil::MakeTokenShape(); Shape while_shape = @@ -267,7 +270,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) { builder.AddInstruction( HloInstruction::CreateTuple({gte_0, gte_1, out_token})); - return module().AddEmbeddedComputation(builder.Build()); + return m->AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); @@ -277,14 +280,14 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) { auto* init_value = builder.AddInstruction( HloInstruction::CreateTuple({scalar_param, scalar_param, token})); auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); + while_shape, 
MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); builder.AddInstruction( HloInstruction::CreateGetTupleElement(scalar_s32, while_inst, 0)); - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); ASSERT_FALSE(simplified_loop); EXPECT_THAT(while_inst->while_body()->instructions(), @@ -294,6 +297,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) { TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) { // The bitcast's user, an outfeed, can't be hoisted, so don't hoist the // bitcast either. + auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); auto scalar_f32 = ShapeUtil::MakeShape(F32, {}); auto token_shape = ShapeUtil::MakeTokenShape(); @@ -317,7 +321,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) { builder.AddInstruction( HloInstruction::CreateTuple({gte_0, gte_1, out_token})); - return module().AddEmbeddedComputation(builder.Build()); + return m->AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); @@ -327,15 +331,15 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) { auto* init_value = builder.AddInstruction( HloInstruction::CreateTuple({scalar_param, scalar_param, token})); auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); + while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); builder.AddInstruction( HloInstruction::CreateGetTupleElement(scalar_s32, while_inst, 0)); - module().AddEntryComputation(builder.Build()); + m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); EXPECT_FALSE(simplified_loop); EXPECT_THAT(while_inst->while_body()->instructions(), @@ -346,6 +350,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) { TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) { // The bitcast's user can be hoisted, so hoist the bitcast too. 
+ auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); auto scalar_f32 = ShapeUtil::MakeShape(F32, {}); Shape while_shape = @@ -367,21 +372,20 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) { builder.AddInstruction( HloInstruction::CreateTuple({gte_0, gte_1, add_inst})); - return module().AddEmbeddedComputation(builder.Build()); + return m->AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); auto* init_value = builder.AddInstruction( HloInstruction::CreateParameter(0, while_shape, "init_value")); builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); + while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); - HloComputation* entry_computation = - module().AddEntryComputation(builder.Build()); + HloComputation* entry_computation = m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); EXPECT_TRUE(simplified_loop); HloInstruction* transformed_while; @@ -396,6 +400,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) { } TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistControlDependencies) { + auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32}); @@ -416,22 +421,23 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistControlDependencies) { builder.AddInstruction( HloInstruction::CreateTuple({gte_0, gte_1, add_result})); - while_body = module().AddEmbeddedComputation(builder.Build()); + while_body = m->AddEmbeddedComputation(builder.Build()); } HloComputation::Builder builder(TestName()); auto* init_value = builder.AddInstruction( HloInstruction::CreateParameter(0, while_shape, "init_value")); builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); - module().AddEntryComputation(builder.Build()); + while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); + m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); EXPECT_FALSE(simplified_loop); } TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) { + auto m = CreateNewVerifiedModule(); auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32}); @@ -439,7 +445,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) { HloComputation::Builder builder(TestName() + ".passthrough"); HloInstruction* param = builder.AddInstruction( HloInstruction::CreateParameter(0, while_shape, "param")); - HloComputation* result = module().AddEmbeddedComputation(builder.Build()); + HloComputation* result = m->AddEmbeddedComputation(builder.Build()); result->AddInstruction( HloInstruction::CreateGetTupleElement(scalar_s32, param, 1)); @@ -450,11 +456,11 @@ TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) { auto* init_value = builder.AddInstruction( HloInstruction::CreateParameter(0, while_shape, "init_value")); builder.AddInstruction(HloInstruction::CreateWhile( - while_shape, MakeAlwaysTrueComputation(while_shape, &module()), - while_body, init_value)); - 
module().AddEntryComputation(builder.Build()); + while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body, + init_value)); + m->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); EXPECT_FALSE(simplified_loop); } @@ -482,14 +488,14 @@ ENTRY entry { )"; TEST_F(WhileLoopInvariantCodeMotionTest, HoistsConstantWhenAsked) { - ParseAndVerifyModule(kConstantHoistingTestCase); + auto m = ParseAndReturnVerifiedModule(kConstantHoistingTestCase).ValueOrDie(); TF_ASSERT_OK_AND_ASSIGN( bool simplified_loop, - WhileLoopInvariantCodeMotion{/*hoist_constants=*/true}.Run(&module())); + WhileLoopInvariantCodeMotion{/*hoist_constants=*/true}.Run(m.get())); EXPECT_TRUE(simplified_loop); - HloComputation* while_body = module().GetComputationWithName("wide.body"); + HloComputation* while_body = m->GetComputationWithName("wide.body"); ASSERT_NE(while_body, nullptr); // We expect the while body to be the equivalent of: @@ -523,10 +529,44 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistsConstantWhenAsked) { } TEST_F(WhileLoopInvariantCodeMotionTest, DoesNotHoistConstantByDefault) { - ParseAndVerifyModule(kConstantHoistingTestCase); + auto m = ParseAndReturnVerifiedModule(kConstantHoistingTestCase).ValueOrDie(); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, - WhileLoopInvariantCodeMotion{}.Run(&module())); + WhileLoopInvariantCodeMotion{}.Run(m.get())); + EXPECT_FALSE(simplified_loop); +} + +TEST_F(WhileLoopInvariantCodeMotionTest, DoNotHoistOutOfSingleIteration) { + const char* const kHloModule = R"( + HloModule ModuleWithWhile + + body { + p_body = (f32[2], f32[2], f32[2], s32[]) parameter(0) + val.0 = f32[2] get-tuple-element(p_body), index=0 + val.1 = f32[2] get-tuple-element(p_body), index=1 + add = f32[2] add(val.0, val.1) + const = s32[] constant(-1) + ROOT root = (f32[2], f32[2], f32[2], s32[]) tuple(val.0, val.1, add, const) + } + + condition { + p_cond = (f32[2], f32[2], f32[2], s32[]) parameter(0) + gte = s32[] get-tuple-element(p_cond), index=3 + const = s32[] constant(42) + ROOT result = pred[] equal-to(gte, const) + } + + ENTRY entry { + param.0 = f32[2] parameter(0) + param.1 = s32[] parameter(1) + while_init = (f32[2], f32[2], f32[2], s32[]) tuple(param.0, param.0, param.0, param.1) + ROOT while = (f32[2], f32[2], f32[2], s32[]) while(while_init), condition=condition, body=body + })"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(kHloModule)); + + TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, + WhileLoopInvariantCodeMotion{}.Run(module.get())); EXPECT_FALSE(simplified_loop); } diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index 630d71e5ca2..6f924a29d8a 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -20,40 +20,14 @@ limitations under the License. #include "absl/strings/str_join.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/call_inliner.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/service/while_loop_analysis.h" namespace xla { using absl::optional; - -// Determines whether the given instruction is a send/recv node, or has a -// subcomputation which contains a send/recv node. 
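The comment above introduces the local helper that this change deletes; its implementation is removed in the next hunk and replaced by the hlo_query utility added to the includes. For reference, the equivalent query, as Run() uses it further down, has the following call shape; whether ContainsInstrWithOpcode also descends into called computations, as the deleted helper did, is not visible in this diff:

// Replacement for the deleted ContainsSendOrRecv(comp) helper.
bool has_send_or_recv = ContainsInstrWithOpcode(
    comp, {HloOpcode::kSend, HloOpcode::kSendDone, HloOpcode::kRecv,
           HloOpcode::kRecvDone});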
-static bool IsOrContainsSendOrRecv(const HloInstruction* instr); - -// Determines whether the given computation contains a send or recv node. -static bool ContainsSendOrRecv(const HloComputation* comp) { - for (const auto* instr : comp->instructions()) { - if (IsOrContainsSendOrRecv(instr)) { - return true; - } - } - return false; -} - -static bool IsOrContainsSendOrRecv(const HloInstruction* instr) { - if (instr->opcode() == HloOpcode::kSend || - instr->opcode() == HloOpcode::kSendDone || - instr->opcode() == HloOpcode::kRecv || - instr->opcode() == HloOpcode::kRecvDone) { - return true; - } - for (const auto& subcomp : instr->called_computations()) { - if (ContainsSendOrRecv(subcomp)) { - return true; - } - } - return false; -} +using hlo_query::ContainsInstrWithOpcode; // Tries to remove elements in a while loop's tuple that aren't used within the // loop. @@ -253,7 +227,7 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { // Create the new while condition, body, and init value. std::unique_ptr new_while_cond = while_cond->CloneWithReplacements( - make_while_computation_replacements(while_cond), /*extras=*/{}); + make_while_computation_replacements(while_cond)); std::unordered_map> while_body_replacements = make_while_computation_replacements(while_body); @@ -266,8 +240,7 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { while_body_replacements.emplace( while_body_root, HloInstruction::CreateTuple(new_while_body_root_elems)); std::unique_ptr new_while_body = - while_body->CloneWithReplacements(std::move(while_body_replacements), - /*extras=*/{}); + while_body->CloneWithReplacements(std::move(while_body_replacements)); // Add a new while_init instruction that repackages the old while_init // instruction's elements. We rely on the AlgebraicSimplifier and DCE to @@ -458,6 +431,180 @@ static StatusOr TryPropagateConstant(HloInstruction* while_op) { return changed_cond || changed_body; } +// Converts a flat list of instructions into a tuple of the desired shape. For +// example, given a tuple shape ((x, x), x) and instructions {A, B, C}, returns +// a tuple of value ((A, B), C). +// +// desired_shape must be a tuple. (This precondition allows us to return a +// unique_ptr rather than a raw ptr.) +static std::unique_ptr UnflattenTupleInstr( + absl::Span instrs, const Shape& desired_shape, + std::vector>* new_instrs) { + CHECK(ShapeUtil::IsTuple(desired_shape)) + << ShapeUtil::HumanString(desired_shape); + + // For each child shape in `desired_shape`, slice out the correct number of + // `instrs` and call UnflattenTupleInstr recursively. At each step we remove + // elements from `instrs` so that it only contains instructions we have not + // yet processed. + std::vector elems; + for (int64 i = 0; i < desired_shape.tuple_shapes_size(); ++i) { + const Shape& subshape = desired_shape.tuple_shapes(i); + if (!ShapeUtil::IsTuple(subshape)) { + elems.push_back(instrs[0]); + instrs.remove_prefix(1); + continue; + } + + // Count the number of leaf nodes underneath desired_shape[i]. 
+ int64 num_leaves = 0; + ShapeUtil::ForEachSubshape( + subshape, [&](const Shape& s, const ShapeIndex& /*index*/) { + if (!ShapeUtil::IsTuple(s)) { + ++num_leaves; + } + }); + + std::unique_ptr subinstr = + UnflattenTupleInstr(instrs.subspan(0, num_leaves), + desired_shape.tuple_shapes(i), new_instrs); + elems.push_back(subinstr.get()); + new_instrs->push_back(std::move(subinstr)); + instrs.remove_prefix(num_leaves); + } + return HloInstruction::CreateTuple(elems); +} + +// Builds a vector whose elements are the values in the flattened tuple for +// `instr`. For example, if `instr` is a tuple of form ((A, B), C), returns the +// vector {A, B, C} (or kGetTupleElement ops which point to A, B, and C). +static std::vector GetFlatTupleElems( + HloInstruction* instr, + std::vector>* new_instrs) { + const auto& shape = instr->shape(); + if (!ShapeUtil::IsTuple(shape)) { + return {instr}; + } + std::vector elems; + for (int64 i = 0; i < shape.tuple_shapes_size(); ++i) { + const Shape& subshape = shape.tuple_shapes(i); + new_instrs->push_back( + HloInstruction::CreateGetTupleElement(subshape, instr, i)); + auto* gte = new_instrs->back().get(); + auto flattened_subshape = GetFlatTupleElems(gte, new_instrs); + elems.insert(elems.end(), flattened_subshape.begin(), + flattened_subshape.end()); + } + return elems; +} + +static StatusOr TryFlattenNestedTuples(HloInstruction* while_op) { + HloModule* module = while_op->GetModule(); + HloComputation* computation = while_op->parent(); + auto* while_init = while_op->mutable_operand(0); + auto* while_body = while_op->while_body(); + auto* while_cond = while_op->while_condition(); + auto* while_body_root = while_body->root_instruction(); + if (while_init->opcode() != HloOpcode::kTuple || + while_body_root->opcode() != HloOpcode::kTuple) { + return false; + } + + TF_RET_CHECK(while_cond->num_parameters() == 1); + TF_RET_CHECK(while_body->num_parameters() == 1); + TF_RET_CHECK( + ShapeUtil::Compatible(while_init->shape(), while_body_root->shape())); + Shape while_shape = while_init->shape(); + if (!ShapeUtil::IsNestedTuple(while_shape)) { + return false; + } + + // Cowardly refuse to perform this optimization in the presence of kDomain + // instructions, which may reference other instructions in the loop and + // therefore make this complicated. + if (ContainsInstrWithOpcode(while_body, {HloOpcode::kDomain}) || + ContainsInstrWithOpcode(while_cond, {HloOpcode::kDomain})) { + return false; + } + + std::vector flattened_shape_elems; + ShapeUtil::ForEachSubshape(while_shape, + [&](const Shape& s, const ShapeIndex& /*index*/) { + if (!ShapeUtil::IsTuple(s)) { + flattened_shape_elems.push_back(s); + } + }); + Shape flattened_shape = ShapeUtil::MakeTupleShape(flattened_shape_elems); + + // `new_instrs` holds instructions created outside of a computation for + // cloning. Elements added here just need to live until the end of the + // relevant CloneWithReplacement call. 
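UnflattenTupleInstr and GetFlatTupleElems above are inverse traversals of the same tuple tree: one rebuilds nesting from a flat list of leaves, the other flattens nesting back into that list. A minimal standalone sketch of the same recursion on plain containers (Node and Value are illustrative stand-ins, not XLA types; a cursor replaces the explicit per-subtree leaf count):

#include <vector>

// Stand-in for a (possibly nested) tuple shape: a leaf or a list of children.
struct Node {
  bool is_leaf = true;
  std::vector<Node> children;
};

// Stand-in for a value of that shape.
struct Value {
  bool is_leaf = true;
  int leaf = 0;
  std::vector<Value> elements;
};

// Rebuilds a nested Value of shape `desired`, consuming leaves from `flat`
// left to right -- the same idea as UnflattenTupleInstr's leaf counting.
Value Unflatten(const Node& desired, const int*& flat) {
  Value v;
  if (desired.is_leaf) {
    v.leaf = *flat++;
    return v;
  }
  v.is_leaf = false;
  for (const Node& child : desired.children) {
    v.elements.push_back(Unflatten(child, flat));
  }
  return v;
}

// Flattens a nested Value back into leaf order, mirroring GetFlatTupleElems.
void Flatten(const Value& v, std::vector<int>* out) {
  if (v.is_leaf) {
    out->push_back(v.leaf);
    return;
  }
  for (const Value& e : v.elements) Flatten(e, out);
}

// Example: shape ((x, x), x) with flat leaves {1, 2, 3} unflattens to
// ((1, 2), 3), and Flatten() recovers {1, 2, 3} again.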
+ std::vector> new_instrs; + auto add_new_instr = [&](std::unique_ptr instr) { + new_instrs.push_back(std::move(instr)); + return new_instrs.back().get(); + }; + + auto nested = [&](HloInstruction* instr) { + std::vector gtes; + const Shape& flat_shape = instr->shape(); + for (int64 i = 0; i < flat_shape.tuple_shapes_size(); ++i) { + gtes.push_back(add_new_instr(HloInstruction::CreateGetTupleElement( + flat_shape.tuple_shapes(i), instr, i))); + } + auto nested_instr = + UnflattenTupleInstr(absl::MakeSpan(gtes), while_shape, &new_instrs); + CHECK(ShapeUtil::Compatible(nested_instr->shape(), while_shape)) + << ShapeUtil::HumanString(nested_instr->shape()) << " vs " + << ShapeUtil::HumanString(while_shape); + return nested_instr; + }; + + auto flattened = [&](HloInstruction* instr) { + return HloInstruction::CreateTuple(GetFlatTupleElems(instr, &new_instrs)); + }; + + // Create a new while-condition computation, where parameter 0 has flat shape + // but all uses of it go through the nested shape. + std::unique_ptr new_while_cond = + while_cond->CloneWithReplacementPairs({ + while_cond->parameter_instruction(0), + nested(add_new_instr(HloInstruction::CreateParameter( + 0, flattened_shape, + while_cond->parameter_instruction(0)->name()))), + }); + + // Create a new while-body computation, where parameter 0 has a flat shape and + // all uses of it go through the nested shape, and where the root has a flat + // shape constructed from the old nested root. + std::unique_ptr new_while_body = + while_body->CloneWithReplacementPairs( + { + while_body->parameter_instruction(0), + nested(add_new_instr(HloInstruction::CreateParameter( + 0, flattened_shape, + while_body->parameter_instruction(0)->name()))), + }, + { + while_body->root_instruction(), + flattened(add_new_instr(while_body->root_instruction()->Clone())), + }); + + // Create the final while loop, and add any new instructions created to + // `computation`. + new_instrs.clear(); + TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction( + while_op, nested(computation->AddInstruction(HloInstruction::CreateWhile( + flattened_shape, + module->AddEmbeddedComputation(std::move(new_while_cond)), + module->AddEmbeddedComputation(std::move(new_while_body)), + computation->AddInstruction(flattened(while_init))))))); + for (auto& instr : new_instrs) { + computation->AddInstruction(std::move(instr)); + } + return true; +} + StatusOr WhileLoopSimplifier::Run(HloModule* module) { XLA_VLOG_LINES(3, "WhileLoopSimplifier::Run(), before:\n" + module->ToString()); @@ -478,32 +625,46 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { for (HloInstruction* while_op : while_ops) { // We can't remove while loops that contain send/recv nodes, because we rely // on the particular loop structure around the node matching on the send and - // recv sides. Removing dead while params requires us to remove the loop + // recv sides. Other while simplifications require us to remove the loop // and replace it with a new one, so we can't do that either. 
- if (ContainsSendOrRecv(while_op->while_body()) || - ContainsSendOrRecv(while_op->while_condition())) { + if (ContainsInstrWithOpcode(while_op->while_body(), + {HloOpcode::kSend, HloOpcode::kSendDone, + HloOpcode::kRecv, HloOpcode::kRecvDone}) || + ContainsInstrWithOpcode(while_op->while_condition(), + {HloOpcode::kSend, HloOpcode::kSendDone, + HloOpcode::kRecv, HloOpcode::kRecvDone})) { VLOG(2) << "Not attempting to simplify while loop because it contains a " "send/recv node: " << while_op->ToShortString(); continue; } - StatusOr result = TryPropagateConstant(while_op); - TF_RETURN_IF_ERROR(result.status()); - changed |= result.ValueOrDie(); + TF_ASSIGN_OR_RETURN(bool result, TryPropagateConstant(while_op)); + changed |= result; - result = TryRemoveWhileLoop(while_op); - TF_RETURN_IF_ERROR(result.status()); - if (result.ValueOrDie()) { - changed = true; - // Don't try to remove dead while params after successfully removing the - // while loop -- that would result in use-after-free nastiness. + TF_ASSIGN_OR_RETURN(result, TryRemoveWhileLoop(while_op)); + changed |= result; + if (result) { + // Don't continue simplifying after successfully removing the while loop + // -- that would result in use-after-free nastiness. continue; } - result = TryRemoveDeadWhileParams(while_op); - TF_RETURN_IF_ERROR(result.status()); - changed |= result.ValueOrDie(); + TF_ASSIGN_OR_RETURN(result, TryFlattenNestedTuples(while_op)); + changed |= result; + if (result) { + // Successfully flattening nested tuples results in us cloning and + // replacing the while loop, meaning that `while_op` is no longer valid. + continue; + } + + TF_ASSIGN_OR_RETURN(result, TryRemoveDeadWhileParams(while_op)); + changed |= result; + if (result) { + // Successfully removing dead while params results in us cloning and + // replacing the while loop, meaning that `while_op` is no longer valid. + continue; + } } XLA_VLOG_LINES(3, diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h index 0bc5a0107bb..a378f179c63 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.h +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h @@ -25,11 +25,22 @@ namespace xla { // HLO pass that makes the following transformations on while loops: // // - A while loop with static trip count of 0 is deleted. +// // - A while loop with static trip count of 1 is replaced by its body (sans // loop). +// // - Elements of a while loop's tuple that the loop doesn't use are removed // from the tuple. // +// - If the while loop's parameter is a nested tuple, it's flattened to a +// single-level tuple. This is good because it usually reduces the number of +// kTuple instructions, but also because it unlocks additional optimizations +// (e.g. removing unused loop parameters). +// +// Flattening nested while loop tuples adds a whole mess of likely unnecessary +// kGetTupleElement and kTuple operations to the graph. We expect that tuple +// simplifier will be run afterwards. +// class WhileLoopSimplifier : public HloModulePass { public: ~WhileLoopSimplifier() override {} diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc index 1c892ba179e..05005e0b262 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc @@ -17,9 +17,11 @@ limitations under the License. 
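To make the flattening described in the new while_loop_simplifier.h comment concrete, these are the loop-state shapes exercised by the FlattenNestedTuple test added later in this change:

  loop state before the pass: ((s32[1]), (s32[2], s32[3], (s32[4])))
  loop state after the pass:  (s32[1], s32[2], s32[3], s32[4])

The nested shape is rebuilt only at the loop boundary, so the entry computation's result keeps the original nested shape; the extra get-tuple-element/tuple glue this introduces is expected to be cleaned up by the tuple simplifier, as the header comment notes.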
#include "absl/strings/str_cat.h" #include "absl/strings/str_replace.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { @@ -27,18 +29,21 @@ namespace { namespace op = xla::testing::opcode_matchers; -class WhileLoopSimplifierTest : public HloVerifiedTestBase { +class WhileLoopSimplifierTest : public HloTestBase { protected: // Makes an HloModule that contains a loop with `num_iters` iteration. - void MakeModuleWithSimpleLoop(int num_iters); + TF_MUST_USE_RESULT std::unique_ptr + MakeModuleWithSimpleLoop(int num_iters); // Similar to MakeModuleWithSimpleLoop except that the loop bound is passed to // the loop-condition through an element of a tuple which is the // loop-condition parameter. - void MakeModuleWithSimpleLoopTupleElementLoopBound(int num_iters); + TF_MUST_USE_RESULT std::unique_ptr + MakeModuleWithSimpleLoopTupleElementLoopBound(int num_iters); }; -void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) { +std::unique_ptr +WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) { string hlo_string_template = R"( HloModule SimpleLoop SimpleLoop.body { @@ -67,10 +72,11 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) { string hlo_string = absl::StrReplaceAll( hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}}); - ParseAndVerifyModule(hlo_string); + return ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); } -void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound( +std::unique_ptr +WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound( int num_iters) { string hlo_string_template = R"( HloModule SimpleLoopWithIndirectLoopBound @@ -104,60 +110,55 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound( string hlo_string = absl::StrReplaceAll( hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}}); - ParseAndVerifyModule(hlo_string); + return ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); } TEST_F(WhileLoopSimplifierTest, LoopWithZeroIterationSimiplified) { - MakeModuleWithSimpleLoop(/*num_iters=*/0); - HloModule* the_module = &module(); - ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); - EXPECT_THAT(the_module->entry_computation()->root_instruction(), + auto m = MakeModuleWithSimpleLoop(/*num_iters=*/0); + ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Tuple(op::Constant(), op::Constant())); } TEST_F(WhileLoopSimplifierTest, LoopWithZeroIterationTupleElementLoopBoundSimplified) { - MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/0); - HloModule* the_module = &module(); - ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); - EXPECT_THAT(the_module->entry_computation()->root_instruction(), + auto m = MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/0); + ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Tuple(op::Constant(), op::Constant(), op::Constant())); } TEST_F(WhileLoopSimplifierTest, LoopWithOneIterationSimplified) { - MakeModuleWithSimpleLoop(/*num_iters=*/1); - 
HloModule* the_module = &module(); - ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); - EXPECT_THAT(the_module->entry_computation()->root_instruction(), + auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1); + ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Tuple(op::Add(), op::Multiply())); } TEST_F(WhileLoopSimplifierTest, LoopWithOneIterationTupleELementLoopBoundSimplified) { - MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/1); - HloModule* the_module = &module(); - ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); - EXPECT_THAT(the_module->entry_computation()->root_instruction(), + auto m = MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/1); + ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), op::Tuple(op::Add(), op::Multiply(), op::Constant())); } TEST_F(WhileLoopSimplifierTest, LoopWithTwoIterationsNotSimplified) { - MakeModuleWithSimpleLoop(/*num_iters=*/2); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = MakeModuleWithSimpleLoop(/*num_iters=*/2); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } TEST_F(WhileLoopSimplifierTest, LoopWithControlDependencySimplifiedDependencyPreserved) { - MakeModuleWithSimpleLoop(/*num_iters=*/1); - HloModule* the_module = &module(); - HloComputation* computation = the_module->entry_computation(); + auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1); + HloComputation* computation = m->entry_computation(); auto* while_op = computation->root_instruction(); ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile); auto* true_op = while_op->while_body()->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); TF_ASSERT_OK(true_op->AddControlDependencyTo( while_op->while_body()->root_instruction())); - ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); + ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction()->control_predecessors(), ElementsAre(op::Constant())) << computation->ToString(); @@ -166,9 +167,8 @@ TEST_F(WhileLoopSimplifierTest, // Loops that contain send/recv nodes can't be simplified; the loop structure // around send/recv nodes must be preserved. 
TEST_F(WhileLoopSimplifierTest, LoopWithSendNotSimplified) { - MakeModuleWithSimpleLoop(/*num_iters=*/1); - HloModule* the_module = &module(); - HloComputation* computation = the_module->entry_computation(); + auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1); + HloComputation* computation = m->entry_computation(); auto* while_op = computation->root_instruction(); ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile); auto* while_body = while_op->while_body(); @@ -179,13 +179,12 @@ TEST_F(WhileLoopSimplifierTest, LoopWithSendNotSimplified) { token, /*channel_id=*/0)); while_body->AddInstruction(HloInstruction::CreateSendDone(send)); - EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) { - MakeModuleWithSimpleLoop(/*num_iters=*/1); - HloModule* the_module = &module(); - HloComputation* computation = the_module->entry_computation(); + auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1); + HloComputation* computation = m->entry_computation(); auto* while_op = computation->root_instruction(); ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile); auto* while_body = while_op->while_body(); @@ -194,7 +193,7 @@ TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) { HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}), token, /*channel_id=*/0)); while_body->AddInstruction(HloInstruction::CreateRecvDone(recv)); - EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } // The limitation on not being able to simplify loops that contain infeeds (and @@ -202,16 +201,15 @@ TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) { // fact that our infrastructure sees simplifying such a loop as tantamount to // removing the non-removable instruction. TEST_F(WhileLoopSimplifierTest, LoopWithInfeedNotSimplified) { - MakeModuleWithSimpleLoop(/*num_iters=*/1); - HloModule* the_module = &module(); - HloComputation* computation = the_module->entry_computation(); + auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1); + HloComputation* computation = m->entry_computation(); auto* while_op = computation->root_instruction(); ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile); auto* while_body = while_op->while_body(); auto token = while_body->AddInstruction(HloInstruction::CreateToken()); while_body->AddInstruction(HloInstruction::CreateInfeed( ShapeUtil::MakeShape(F32, {1}), token, "config")); - EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } // A non-tuple shaped loop shouldn't be simplified or crash the compiler. 
@@ -236,8 +234,8 @@ TEST_F(WhileLoopSimplifierTest, NonTupleShapedLoopNotSimplified) { } )"; - ParseAndVerifyModule(hlo_string); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } // A while loop that does nothing else besides swapping tuple elements @@ -268,8 +266,8 @@ TEST_F(WhileLoopSimplifierTest, LoopSwappingTupleElementsNotSimplified) { } )"; - ParseAndVerifyModule(hlo_string); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } // Construct a loop where we assign a constant to tuple element 0 in each @@ -297,8 +295,8 @@ TEST_F(WhileLoopSimplifierTest, } )"; - ParseAndVerifyModule(hlo_string); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } // Nothing to simplify in a while loop whose tuple has 0 elements. @@ -320,8 +318,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithEmptyTupleNotSimplified) { } )"; - ParseAndVerifyModule(hlo_string); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } // While loop where one tuple element is used twice in the body, and thus can't @@ -348,8 +346,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithElemUsedTwiceNotSimplified) { } )"; - ParseAndVerifyModule(hlo_string); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } // This while loop has three tuple elements. Element 0 is unused and should be @@ -390,16 +388,15 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) { } )"; - ParseAndVerifyModule(hlo_string); - HloModule* the_module = &module(); - EXPECT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); // The original while instruction is still left in the module as a dead // instruction, find a while instruction with a different name as the new // while instruction. 
HloInstruction* new_while_op = - *std::find_if(the_module->entry_computation()->instructions().begin(), - the_module->entry_computation()->instructions().end(), + *std::find_if(m->entry_computation()->instructions().begin(), + m->entry_computation()->instructions().end(), [&](const HloInstruction* instr) { return (instr->opcode() == HloOpcode::kWhile && instr->name() != "while"); @@ -440,8 +437,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithNonTupleBodyShapeNotSimplified) { } )"; - ParseAndVerifyModule(hlo_string); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } TEST_F(WhileLoopSimplifierTest, @@ -473,8 +470,8 @@ TEST_F(WhileLoopSimplifierTest, } )"; - ParseAndVerifyModule(hlo_string); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); } TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) { @@ -505,8 +502,65 @@ TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) { } )"; - ParseAndVerifyModule(hlo_string); - EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie()); + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); +} + +TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) { + const string hlo_string = R"( + HloModule Test + Body { + param = ((s32[1]), (s32[2], s32[3], (s32[4]))) parameter(0) + ta = (s32[1]) get-tuple-element(param), index=0 + a = s32[1] get-tuple-element(ta), index=0 + a.1 = s32[1] add(a, a) + tbcd = (s32[2], s32[3], (s32[4])) get-tuple-element(param), index=1 + ROOT tuple = ((s32[1]), (s32[2], s32[3], (s32[4]))) tuple(ta, tbcd) + } + Cond { + param = ((s32[1]), (s32[2], s32[3], (s32[4]))) parameter(0) + ROOT cond = pred[] constant(true) + } + ENTRY Loop { + a = s32[1] constant({0}) + b = s32[2] constant({0,1}) + c = s32[3] constant({0,1,2}) + d = s32[4] constant({0,1,2,3}) + ta = (s32[1]) tuple(a) + td = (s32[4]) tuple(d) + tbcd = (s32[2], s32[3], (s32[4])) tuple(b, c, td) + init = ((s32[1]), (s32[2], s32[3], (s32[4]))) tuple(ta, tbcd) + ROOT while = ((s32[1]), (s32[2], s32[3], (s32[4]))) while(init), + condition=Cond, body=Body + })"; + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + // DCE away the old loop so there's just one while loop in the module, making + // it easy to find. 
+ EXPECT_TRUE(HloDCE().Run(m.get()).ok()); + + const auto& instrs = m->entry_computation()->instructions(); + HloInstruction* new_while = + *absl::c_find_if(instrs, [](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kWhile; + }); + Shape flat_tuple = + ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3], s32[4])") + .ValueOrDie(); + SCOPED_TRACE(m->ToString()); + EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), flat_tuple)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->root_instruction()->shape(), flat_tuple)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->parameter_instruction(0)->shape(), flat_tuple)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_condition()->parameter_instruction(0)->shape(), + flat_tuple)); + EXPECT_TRUE(ShapeUtil::Equal( + m->entry_computation()->root_instruction()->shape(), + ShapeUtil::ParseShapeString("((s32[1]), (s32[2], s32[3], (s32[4])))") + .ValueOrDie())); } } // namespace diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc index 1f583ca44b7..039ccda7322 100644 --- a/tensorflow/compiler/xla/service/while_util.cc +++ b/tensorflow/compiler/xla/service/while_util.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/while_util.h" #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -270,4 +272,17 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) { return result; } +/*static*/ absl::flat_hash_map> +WhileUtil::GetGTEsMapForWhileConditional( + const HloComputation& while_conditional) { + absl::flat_hash_map> result; + for (HloInstruction* user : + while_conditional.parameter_instruction(0)->users()) { + if (user->opcode() == HloOpcode::kGetTupleElement) { + result[user->tuple_index()].push_back(user); + } + } + return result; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h index 524dcec5f12..cba41ccd8b1 100644 --- a/tensorflow/compiler/xla/service/while_util.h +++ b/tensorflow/compiler/xla/service/while_util.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_ +#include "absl/container/flat_hash_map.h" +#include "absl/container/inlined_vector.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -85,6 +87,13 @@ class WhileUtil { // Assumes `while_body` is the body computation of the while loop in question. static std::vector GetInvariantGTEsForWhileBody( const HloComputation& while_body); + + // Returns a map of index to GetTupleElement instructions in + // `while_conditional` that access elements in the parameter tuple. Assumes + // `while_conditional` is the conditional computation of the while loop in + // question. 
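The declaration below documents the new WhileUtil helper implemented in the while_util.cc hunk above. A hypothetical caller might use it as follows (`while_op` and the local names are illustrative; the map's value type is the InlinedVector of get-tuple-element instructions per the includes added above):

const HloComputation& cond = *while_op->while_condition();
auto index_to_gtes = WhileUtil::GetGTEsMapForWhileConditional(cond);
for (const auto& entry : index_to_gtes) {
  int64 tuple_index = entry.first;
  for (HloInstruction* gte : entry.second) {
    // Each `gte` is a get-tuple-element that reads element `tuple_index` of
    // the condition's parameter tuple.
  }
}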
+ static absl::flat_hash_map> + GetGTEsMapForWhileConditional(const HloComputation& while_conditional); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc index b9ef18892d7..a546a6d39cc 100644 --- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc +++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc @@ -45,7 +45,8 @@ class ZeroSizedHloEliminationTest : public HloTestBase { 0, ShapeUtil::MakeShape(F32, {3, 0}), "zero sized param"))) {} StatusOr RunZeroSizedElimination() { - auto module = CreateNewModule("zero_sized_elimination_test_module"); + auto module = + CreateNewUnverifiedModule("zero_sized_elimination_test_module"); module->AddEntryComputation(builder_.Build()); return ZeroSizedHloElimination{}.Run(module.get()); } diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h index 14c35e7b84f..33edbd1b20d 100644 --- a/tensorflow/compiler/xla/service_interface.h +++ b/tensorflow/compiler/xla/service_interface.h @@ -47,8 +47,11 @@ class ServiceInterface { virtual Status ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) = 0; - virtual Status ExecuteGraph(const ExecuteGraphRequest* arg, - ExecuteResponse* result) = 0; + virtual Status Compile(const CompileRequest* arg, + CompileResponse* result) = 0; + + virtual Status Execute(const ExecuteRequest* arg, + ExecuteResponse* result) = 0; virtual Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) = 0; diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 17120e610cb..d0c35d8dee4 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -74,6 +74,11 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index) { return out; } +bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const { + return size() >= prefix.size() && + indices_.subspan(0, prefix.size()) == prefix.indices_; +} + namespace { // Returns whether the given primitive type corresponds to an array shape. diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 191ab04759f..a7a3026cf3f 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -147,6 +147,9 @@ class ShapeIndexView { string ToString() const; + // Returns true if this shape index starts with 'prefix'. 
+ bool StartsWith(ShapeIndexView prefix) const; + private: absl::Span indices_; }; diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index d395c9a4cee..db34d34f969 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -44,7 +44,7 @@ cc_library( testonly = True, srcs = ["xla_internal_test_main.cc"], deps = [ - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/core:lib", "//tensorflow/core:test", "@com_google_absl//absl/strings", @@ -117,12 +117,12 @@ cc_library( deps = [ ":literal_test_util", ":test_utils", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:computation_layout", "//tensorflow/compiler/xla/service:hlo", @@ -141,44 +141,6 @@ cc_library( ], ) -cc_library( - name = "hlo_verified_test_base", - testonly = True, - srcs = ["hlo_verified_test_base.cc"], - hdrs = ["hlo_verified_test_base.h"], - deps = [ - ":hlo_test_base", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/service:hlo_verifier", - "//tensorflow/core:lib", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/memory", - ], -) - -tf_cc_test( - name = "hlo_verified_test_base_test", - srcs = ["hlo_verified_test_base_test.cc"], - deps = [ - ":hlo_test_base", - ":hlo_verified_test_base", - ":test_macros_cpu", - ":test_utils", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/client:xla_builder", - "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/service:hlo_verifier", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], -) - tf_cc_binary( name = "local_client_aot_test_helper", srcs = ["local_client_aot_test_helper.cc"], @@ -868,7 +830,8 @@ xla_test( name = "convolution_test", timeout = "long", srcs = ["convolution_test.cc"], - shard_count = 25, + shard_count = 40, + tags = ["optonly"], deps = CONVOLUTION_TEST_DEPS + [ "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc index 9966e4606ef..9930bfc95c2 100644 --- a/tensorflow/compiler/xla/tests/broadcast_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_test.cc @@ -42,7 +42,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) { ShapeUtil::MakeShape(F32, {}), input, {})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -58,7 +58,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) { ShapeUtil::MakeShape(F32, {2, 2}), input, {})); // Create HLO module, compile, and execute. 
- auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -81,7 +81,7 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) { builder.AddInstruction(HloInstruction::CreateTuple({element1, element2})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -102,7 +102,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) { ShapeUtil::MakeShape(F32, {2, 2}), input, {0, 1})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -121,7 +121,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) { ShapeUtil::MakeShape(F32, {2, 2}), input, {1, 0})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -138,7 +138,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) { ShapeUtil::MakeShape(F32, {2, 3, 2}), input, {0, 2})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -158,7 +158,7 @@ TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) { ShapeUtil::MakeShape(F32, {2, 2, 3, 3}), input, {1})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -183,7 +183,7 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) { ShapeUtil::MakeShape(F32, {3, 3, 3, r1_size}), input, {3})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -214,7 +214,7 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) { ShapeUtil::MakeShape(F32, {32, 64, 7, 7}), input, {1})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -230,7 +230,7 @@ TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) { ShapeUtil::MakeShape(F32, {64, 64, 3, 3}), input, {})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); LOG(INFO) << hlo_module->ToString(); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -253,7 +253,7 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) { ShapeUtil::MakeShape(F32, {3, 3, 2, 2}), input, {2, 3})); // Create HLO module, compile, and execute. 
- auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -287,7 +287,7 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) { ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), input, {0, 1, 2})); // Create HLO module, compile, and execute. - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 3aebf784664..211d004ec8c 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -596,6 +596,272 @@ TYPED_TEST(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, Types) { this->RunTest(); } +template +class Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 5}; + std::vector filter_dims = {3, 3, 1, 5}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/5); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); + iota_int_init_value(input_elems, 1); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); + iota_int_init_value(filter_elems, 1); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + auto expected_r1 = LiteralUtil::CreateR1( + {static_cast(6864), static_cast(7296), static_cast(7746), + static_cast(8214), static_cast(8700), static_cast(7809), + static_cast(8286), static_cast(8781), static_cast(9294), + static_cast(9825), static_cast(10644), static_cast(11256), + static_cast(11886), static_cast(12534), static_cast(13200), + static_cast(11589), static_cast(12246), static_cast(12921), + static_cast(13614), static_cast(14325)}); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 5}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + + auto filter_r = filter_r1.Reshape(filter_dims); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid, 
TestTypes); +TYPED_TEST(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 512}; + std::vector filter_dims = {3, 3, 1, 512}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/512); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(2048, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 512}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + + auto filter_r = filter_r1.Reshape(filter_dims); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 160}; + std::vector filter_dims = {3, 3, 1, 160}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
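// Hand-derived check of the expected values in these depthwise tests, where
// feature_group_count equals the input feature count so each output channel
// sees only the matching input channel:
//  - Spatial output is 2x2: a 3x3 kValid window fits 4 - 3 + 1 = 2 positions
//    along each input dimension of size 4.
//  - Convolve2D_1x4x4x5_3x3x1x5 uses iota-filled input and filter, so channel
//    0 of output position (0, 0) is
//      1*1 + 6*6 + 11*11 + 21*16 + 26*21 + 31*26 + 41*31 + 46*36 + 51*41 = 6864,
//    the first element of expected_r1 in that test.
//  - The 512-, 160-, and 1024-channel variants use all-ones inputs and
//    all-twos filters, so every output element is 9 * 1 * 2 = 18.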
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/160); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(640, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + + auto filter_r = filter_r1.Reshape(filter_dims); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid + : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 1024}; + std::vector filter_dims = {3, 3, 1, 1024}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/1024);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(4096, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 =
+        expected_r1.Reshape({1, 2, 2, 1024}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+
+    auto filter_r = filter_r1.Reshape(filter_dims);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
 template <typename T>
 class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
  public:
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 1407e68d9a3..3622f2c1e84 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -45,7 +45,7 @@ class CopyOpTest : public HloTestBase {
     builder.AddInstruction(HloInstruction::CreateUnary(
         constant->shape(), HloOpcode::kCopy, constant));
     auto computation = builder.Build();
-    auto module = CreateNewModule();
+    auto module = CreateNewUnverifiedModule();
     module->AddEntryComputation(std::move(computation));
 
     Literal result = ExecuteAndTransfer(std::move(module), {});
@@ -98,7 +98,7 @@ XLA_TEST_F(CopyOpTest, CopyParameterScalar) {
 
   auto computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
 
   Literal result = ExecuteAndTransfer(std::move(module), {&literal});
@@ -119,7 +119,7 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2Twice) {
 
   auto computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
 
   Literal result = ExecuteAndTransfer(std::move(module), {});
   LiteralTestUtil::ExpectR2Near<float>({{1.0, 2.0}, {3.0, 4.0}}, result,
@@ -143,7 +143,7 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
 
   Literal result = ExecuteAndTransfer(std::move(module), {});
@@ -175,7 +175,7 @@ void CopyOpTest::TestCopyConstantLayout021(size_t n1,
size_t n2, size_t n3) { std::unique_ptr computation = builder.Build(); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(std::move(computation)); ForceResultLayout(module.get(), LayoutUtil::MakeLayout({1, 2, 0})); Literal result = ExecuteAndTransfer(std::move(module), {}); @@ -209,7 +209,7 @@ void CopyOpTest::TestCopyConstantLayoutR4(size_t n1, size_t n2, size_t n3, std::unique_ptr computation = builder.Build(); - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); module->AddEntryComputation(std::move(computation)); ForceResultLayout(module.get(), LayoutUtil::MakeLayout(permutation)); Literal result = ExecuteAndTransfer(std::move(module), {}); diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc index 001490c6a8c..738b6442354 100644 --- a/tensorflow/compiler/xla/tests/custom_call_test.cc +++ b/tensorflow/compiler/xla/tests/custom_call_test.cc @@ -70,7 +70,7 @@ class CustomCallTest : public HloTestBase { }; XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( @@ -85,7 +85,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) { } XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); Array2D array(2, 2); @@ -106,7 +106,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) { } XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto b = HloComputation::Builder(TestName()); auto input = b.AddInstruction( @@ -130,7 +130,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) { } XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(InputAndOutputLayoutDiffer)) { - auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto b = HloComputation::Builder(TestName()); auto input = @@ -155,7 +155,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) { // The argument and result of the computation are set to different layouts, // but the custom call is layout constrained to a fixed operand and result // layout, so the correct result should be produced. 
- auto module = CreateNewModule(); + auto module = CreateNewUnverifiedModule(); auto b = HloComputation::Builder(TestName()); auto input = diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index 4d4b676a538..d1fddf9d6b4 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -81,7 +81,7 @@ class FusionTest : public HloTestBase { } auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto prim_type = primitive_util::NativeToPrimitiveType(); @@ -183,7 +183,7 @@ XLA_TEST_F(FusionTest, Test) { // (-{{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}}), // {{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})) = {{0.5}, {2.72}} auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1.0}, {2.0}, {3.0}}))); auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -231,7 +231,7 @@ XLA_TEST_F(FusionTest, Parameter) { // Build a computation and fuse part of it so the fusion instruction has an // operand parameter. auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1.0, 2.0, 3.0}}))); auto copy1 = builder.AddInstruction(HloInstruction::CreateUnary( @@ -266,7 +266,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) { ShapeUtil::MakeShapeWithLayout(F32, {rand_dim0_size, dim1_size}, {1, 0}); // Build simple fusion computation: y = x^2 (elementwise). 
auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto two = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); @@ -290,7 +290,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) { XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); auto const_array = builder.AddInstruction(HloInstruction::CreateConstant( @@ -314,7 +314,7 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { XLA_TEST_F(FusionTest, ReshapeToScalar) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto single_element_array = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR2({{5}}))); auto reshape = builder.AddInstruction(HloInstruction::CreateReshape( @@ -329,7 +329,7 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) { XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}}))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape( @@ -344,7 +344,7 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR3({{{1, 2, 3}, {4, 5, 6}}}))); auto reshape1 = builder.AddInstruction( @@ -359,7 +359,7 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { XLA_TEST_F(FusionTest, Reshape_1by1by1_) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR3({{{7}}}))); auto reshape1 = builder.AddInstruction( @@ -374,7 +374,7 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) { XLA_TEST_F(FusionTest, Reshape__1by1by1) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(7))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape( @@ -389,7 +389,7 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) { XLA_TEST_F(FusionTest, Reshape__) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(7))); auto reshape1 = builder.AddInstruction( @@ -404,7 +404,7 @@ XLA_TEST_F(FusionTest, Reshape__) { XLA_TEST_F(FusionTest, Reshape_3by3_3by3) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); auto reshape1 = 
builder.AddInstruction( @@ -419,7 +419,7 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) { XLA_TEST_F(FusionTest, Transpose_2by3) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose( @@ -434,7 +434,7 @@ XLA_TEST_F(FusionTest, Transpose_2by3) { XLA_TEST_F(FusionTest, Transpose_3by3) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose( @@ -449,7 +449,7 @@ XLA_TEST_F(FusionTest, Transpose_3by3) { XLA_TEST_F(FusionTest, Reverse) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR1({1, 2, 3}))); auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse( @@ -465,7 +465,7 @@ XLA_TEST_F(FusionTest, Reverse) { XLA_TEST_F(FusionTest, ReverseNegate) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR1({1, 2, 3}))); auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse( @@ -483,7 +483,7 @@ XLA_TEST_F(FusionTest, ReverseNegate) { XLA_TEST_F(FusionTest, BroadcastNegate) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto broadcast1 = builder.AddInstruction(HloInstruction::CreateBroadcast( @@ -501,7 +501,7 @@ XLA_TEST_F(FusionTest, BroadcastNegate) { XLA_TEST_F(FusionTest, SliceNegate) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1, 2, 3, 4}))); auto slice1 = builder.AddInstruction(HloInstruction::CreateSlice( @@ -519,7 +519,7 @@ XLA_TEST_F(FusionTest, SliceNegate) { XLA_TEST_F(FusionTest, DynamicSliceNegate) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1, 2, 3, 4}))); auto const1 = builder.AddInstruction( @@ -541,7 +541,7 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) { XLA_TEST_F(FusionTest, ReshapeNegate) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1, 2, 3, 4}))); auto reshape1 = builder.AddInstruction( @@ -559,7 +559,7 @@ XLA_TEST_F(FusionTest, ReshapeNegate) { XLA_TEST_F(FusionTest, TransposeNegate) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = 
CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1, 2}, {3, 4}}))); auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose( @@ -587,7 +587,7 @@ std::unique_ptr MakeReduceTestComputation() { } XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) { - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -607,7 +607,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) { } XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) { - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -630,7 +630,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) { XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}}))); auto const1 = builder.AddInstruction( @@ -682,7 +682,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) { // into a fusion, it should remain shared, rather than being duplicated // within the fusion. XLA_TEST_F(FusionTest, SharedConstant) { - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction( diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 7ab2ecda586..d8fa00272f8 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -23,8 +23,8 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/memory/memory.h" #include "absl/types/span.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/platform_util.h" @@ -85,6 +85,23 @@ ProgramShape GetProgramShapeWithLayout(const HloModule& module) { } // namespace +Status VerifiedHloModule::Verify() { + if (computation_count() == 0) { + // The computation was never built. Nothing to verify. + return Status::OK(); + } + return verifier_.Run(this).status(); +} + +void VerifiedHloModule::VerifyOrAddFailure(const string& message) { + Status status = Verify(); + if (!status.ok()) { + ADD_FAILURE() << "HloVerifier failed on module " << name() + << (message.empty() ? 
"" : absl::StrCat(" (", message, ")")) + << ": " << status; + } +} + HloTestBase::HloTestBase(bool verifier_layout_sensitive, bool allow_mixed_precision_in_hlo_verifier, std::function @@ -100,17 +117,48 @@ HloTestBase::HloTestBase(se::Platform* test_platform, bool allow_mixed_precision_in_hlo_verifier, std::function instruction_can_change_layout_func) - : test_runner_(test_platform), reference_runner_(reference_platform) { + : test_runner_(test_platform), + reference_runner_(reference_platform), + verifier_layout_sensitive_(verifier_layout_sensitive), + allow_mixed_precision_in_hlo_verifier_( + allow_mixed_precision_in_hlo_verifier) { hlo_verifier_ = absl::make_unique( /*layout_sensitive=*/verifier_layout_sensitive, /*allow_mixed_precision=*/allow_mixed_precision_in_hlo_verifier, instruction_can_change_layout_func); } -std::unique_ptr HloTestBase::CreateNewModule(const string& name) { +std::unique_ptr HloTestBase::CreateNewUnverifiedModule( + const string& name) { return absl::make_unique(name, GetModuleConfigForTest()); } +std::unique_ptr HloTestBase::CreateNewVerifiedModule( + const string& name) { + return absl::make_unique( + name, GetModuleConfigForTest(), verifier_layout_sensitive_, + allow_mixed_precision_in_hlo_verifier_); +} + +StatusOr> +HloTestBase::ParseAndReturnUnverifiedModule(absl::string_view hlo_text, + const HloModuleConfig& config) { + auto module = absl::make_unique(TestName(), config); + TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get())); + return std::move(module); +} + +StatusOr> +HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text, + const HloModuleConfig& config) { + auto module = absl::make_unique( + TestName(), config, verifier_layout_sensitive_, + allow_mixed_precision_in_hlo_verifier_); + TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get())); + TF_RETURN_IF_ERROR(module->Verify()); + return std::move(module); +} + /* static */ StatusOr HloTestBase::RunHloPass(HloPassInterface* hlo_pass, HloModule* module) { @@ -135,7 +183,7 @@ PrecisionConfig HloTestBase::DefaultPrecisionConfig(int operands) { } DebugOptions HloTestBase::GetDebugOptionsForTest() { - auto debug_options = legacy_flags::GetDebugOptionsFromFlags(); + auto debug_options = GetDebugOptionsFromFlags(); // TODO(b/38354253): Change tests to use Parameters instead of Constants. debug_options.add_xla_disable_hlo_passes("constant_folding"); debug_options.set_xla_gpu_max_kernel_unroll_factor(1); diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 217428befa4..366726d90b4 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -38,6 +38,31 @@ limitations under the License. namespace xla { +// An HLO module derived class which verifies itself on destruction. This class +// is intended to be used in unit tests. Any verification errors are raised via +// ADD_FAILURE. +class VerifiedHloModule : public HloModule { + public: + VerifiedHloModule(const string& name, const HloModuleConfig& config, + bool verifier_layout_sensitive, + bool allow_mixed_precision_in_hlo_verifier) + : HloModule(name, config), + verifier_(verifier_layout_sensitive, + allow_mixed_precision_in_hlo_verifier) {} + + ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); } + + // Verifies the module using HloVerifier and returns the status. + Status Verify(); + + // Verifies the module and flags any error with ADD_FAILURE. 'message' is + // included in the failure message. 
+ void VerifyOrAddFailure(const string& message); + + private: + HloVerifier verifier_; +}; + // A base class for tests which build and/or run HLO code. The class includes // support for running an HLO module on two platforms and compare the results. // This is a lower level of abstraction than using the client interface and @@ -72,7 +97,27 @@ class HloTestBase : public ::testing::Test { // options from command-line flags. If you want a fresh HloModule object and // then add HloComputations to it, it's recommended to use this method in your // tests. - std::unique_ptr CreateNewModule(const string& name = TestName()); + // + // This returns a vanilla HloModule that doesn't run the HLO verifier on + // destruction. + std::unique_ptr CreateNewUnverifiedModule( + const string& name = TestName()); + + // Like CreateNewUnverifiedModule, except the HloModule returned here runs the + // HLO verifier on destruction. + std::unique_ptr CreateNewVerifiedModule( + const string& name = TestName()); + + // Parses the given string and returns module as a vanilla, unverified + // HloModule. + StatusOr> ParseAndReturnUnverifiedModule( + absl::string_view hlo_text, + const HloModuleConfig& config = HloModuleConfig()); + + // Parses the given string and returns module as a VerifiedHloModule. + StatusOr> ParseAndReturnVerifiedModule( + absl::string_view hlo_text, + const HloModuleConfig& config = HloModuleConfig()); // Runs the hlo_pass with the provided module and returns the result. This // function also verifies that the module remains unchanged when hlo_pass @@ -247,6 +292,8 @@ class HloTestBase : public ::testing::Test { HloRunner test_runner_; HloRunner reference_runner_; + bool verifier_layout_sensitive_; + bool allow_mixed_precision_in_hlo_verifier_; std::unique_ptr hlo_verifier_; ErrorSpec error_spec_{0.0001}; diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc deleted file mode 100644 index 8bd0a729b77..00000000000 --- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" - -#include "absl/memory/memory.h" -#include "tensorflow/compiler/xla/service/hlo_parser.h" -#include "tensorflow/compiler/xla/service/hlo_verifier.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/core/platform/logging.h" - -namespace xla { - -Status VerifiedHloModule::Verify() { - if (computation_count() == 0) { - // The computation was never built. Nothing to verify. 
- return Status::OK(); - } - return verifier_.Run(this).status(); -} - -void VerifiedHloModule::VerifyOrAddFailure(const string& message) { - Status status = Verify(); - if (!status.ok()) { - ADD_FAILURE() << "HloVerifier failed on module " << name() - << (message.empty() ? "" : absl::StrCat(" (", message, ")")) - << ": " << status; - } -} - -HloVerifiedTestBase::HloVerifiedTestBase(bool layout_sensitive, - bool allow_mixed_precision) - : HloTestBase( - /*verifier_layout_sensitive=*/layout_sensitive, - /*allow_mixed_precision_in_hlo_verifier=*/allow_mixed_precision), - verifier_layout_sensitive_(layout_sensitive), - allow_mixed_precision_in_hlo_verifier_(allow_mixed_precision) {} - -HloModule& HloVerifiedTestBase::module() { - if (!module_) { - module_ = CreateNewVerifiedModule(TestName()); - } - return *module_; -} - -HloModule* HloVerifiedTestBase::CreateNewModule(const string& name) { - modules_.emplace_back(CreateNewVerifiedModule(name)); - return modules_.back().get(); -} - -void HloVerifiedTestBase::ParseAndVerifyModule(absl::string_view hlo_text, - const HloModuleConfig& config) { - CHECK(!module_) << "Called ParseModule when test already has a module."; - module_ = CreateNewVerifiedModule(TestName()); - TF_CHECK_OK(ParseHloString(hlo_text, module_.get())); - module_->VerifyOrAddFailure("after parsing"); -} - -StatusOr> -HloVerifiedTestBase::ParseAndReturnVerifiedModule( - absl::string_view hlo_text, const HloModuleConfig& config) { - auto module = CreateNewVerifiedModule(TestName()); - TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get())); - TF_RETURN_IF_ERROR(module->Verify()); - return std::move(module); -} - -std::unique_ptr HloVerifiedTestBase::CreateNewVerifiedModule( - const string& name) { - return absl::make_unique( - name, GetModuleConfigForTest(), verifier_layout_sensitive_, - allow_mixed_precision_in_hlo_verifier_); -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h deleted file mode 100644 index 388a99bb364..00000000000 --- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_ -#define TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_ - -#include -#include -#include - -#include "absl/base/macros.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" - -namespace xla { - -// An HLO module derived class which verifies itself on destruction. This class -// is intended to be used in unit tests. Any verification errors are raised via -// ADD_FAILURE. 
-class VerifiedHloModule : public HloModule { - public: - VerifiedHloModule(const string& name, const HloModuleConfig& config, - bool verifier_layout_sensitive, - bool allow_mixed_precision_in_hlo_verifier) - : HloModule(name, config), - verifier_(verifier_layout_sensitive, - allow_mixed_precision_in_hlo_verifier) {} - - ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); } - - // Verifies the module using HloVerifier and returns the status. - Status Verify(); - - // Verifies the module and flags any error with ADD_FAILURE. 'message' is - // included in the failure message. - void VerifyOrAddFailure(const string& message); - - private: - HloVerifier verifier_; -}; - -// A base class for HLO tests that stores a default VerifiedHloModule. -class HloVerifiedTestBase : public HloTestBase { - protected: - HloVerifiedTestBase(bool layout_sensitive = false, - bool allow_mixed_precision = false); - - // Constructs a default shape verifier. - std::unique_ptr MakeShapeVerifier(); - - // Returns the default HloModule, lazily creating it if necessary via - // HloTestBase::CreateNewModule(). - ABSL_DEPRECATED("Use CreateNewVerifiedModule() instead.") - HloModule& module(); - - ABSL_DEPRECATED("Use ParseAndReturnVerifiedModule() instead.") - void ParseAndVerifyModule(absl::string_view hlo_text, - const HloModuleConfig& config = HloModuleConfig()); - - // Parses the given string and returns module as a VerifiedHloModule. - StatusOr> ParseAndReturnVerifiedModule( - absl::string_view hlo_text, - const HloModuleConfig& config = HloModuleConfig()); - - // Creates a new module for a test, and stores it in modules_ so it can be - // verified. Intentionally hides HloTestBase::CreateNewModule, to prevent - // creation of unverified modules. - ABSL_DEPRECATED("Use CreateNewVerifiedModule() instead.") - HloModule* CreateNewModule(const string& name = TestName()); - - // Creates and returns a verified HLO module with the given name. - std::unique_ptr CreateNewVerifiedModule( - const string& name = TestName()); - - private: - // It is confusing to store modules created by module() and CreateNewModule() - // in different fields, but it allows us to migrate tests to - // HloVerifiedTestBase more easily, so it's a win because we can verify more - // modules. See b/80488902. - // - // Lazily populated. Access via module(). - std::unique_ptr module_; - - // Populated by calls to CreateNewModule. - std::vector> modules_; - - bool verifier_layout_sensitive_; - bool allow_mixed_precision_in_hlo_verifier_; -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_ diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc deleted file mode 100644 index 5c0263e811f..00000000000 --- a/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" - -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/hlo_verifier.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { -namespace { - -// This class includes unit tests which are expected to fail because invalid HLO -// modules are intentionally built. Unfortunately, Tensorflow doesn't appear to -// include the necessary gunit parts to test this test machinery (needs the -// macro EXPECT_NONFATAL_FAILURE). The disabled tests can be run with the -// disabled tests enabled and failures can be manually compared against -// expectations. -class HloVerifiedTestBaseTest : public HloVerifiedTestBase {}; - -XLA_TEST_F(HloVerifiedTestBaseTest, NoModule) { - // Test shouldn't fail if no module is created at all. -} - -XLA_TEST_F(HloVerifiedTestBaseTest, GoodLazilyCreatedModule) { - // Use module() to lazily create an empty module, build it up, and verify no - // failures. - HloModule& hlo_module = module(); - auto builder = HloComputation::Builder(TestName()); - auto input = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); - builder.AddInstruction( - HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input)); - hlo_module.AddEntryComputation(builder.Build()); -} - -// This test is expected to fail. See test class comment. -XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_BadLazilyCreatedModule) { - // Use module() to lazily create an empty module and build up an invalid - // module. - HloModule& hlo_module = module(); - auto builder = HloComputation::Builder(TestName()); - auto input = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); - builder.AddInstruction( - HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input)); - hlo_module.AddEntryComputation(builder.Build()); - - *hlo_module.entry_computation()->root_instruction()->mutable_shape() = - ShapeUtil::MakeShape(PRED, {1, 2, 3}); -} - -XLA_TEST_F(HloVerifiedTestBaseTest, GoodCreateNewModule) { - // Call CreateNewModule and build up a valid module. - HloModule* module = CreateNewModule(); - auto builder = HloComputation::Builder(TestName()); - auto input = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); - builder.AddInstruction( - HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input)); - module->AddEntryComputation(builder.Build()); -} - -// This test is expected to fail. See test class comment. -XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_BadCreateNewModule) { - // Call CreateNewModule and build up a invalid module. 
- HloModule* module = CreateNewModule(); - auto builder = HloComputation::Builder(TestName()); - auto input = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); - builder.AddInstruction( - HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input)); - module->AddEntryComputation(builder.Build()); - - *module->entry_computation()->root_instruction()->mutable_shape() = - ShapeUtil::MakeShape(PRED, {1, 2, 3}); -} - -XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndVerifyModuleGood) { - const char* const hlo_string = R"( -HloModule ParseAndVerifyModuleGood - -ENTRY entry { - x = f32[] parameter(0) - y = f32[] parameter(1) - ROOT add = f32[] add(x,y) -} -)"; - - ParseAndVerifyModule(hlo_string); - EXPECT_EQ(module().entry_computation()->instruction_count(), 3); -} - -XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndReturnVerifiedModuleGood) { - const char* const hlo_string = R"( -HloModule ParseAndReturnVerifiedModuleGood - -ENTRY entry { - x = f32[] parameter(0) - y = f32[] parameter(1) - ROOT add = f32[] add(x,y) -} -)"; - - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseAndReturnVerifiedModule(hlo_string)); - EXPECT_EQ(module->entry_computation()->instruction_count(), 3); -} - -XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndReturnVerifiedModuleInvalidText) { - const char* const hlo_string = R"( -HloModule ParseAndReturnVerifiedModuleGood - -ENTRY entry { - x = f32[] parameter(0) - y = f32[] parameter(1) - ROOT add = f32[] add(x,y) -} - -RANDOM GARBAGE -)"; - - ASSERT_IS_NOT_OK(ParseAndReturnVerifiedModule(hlo_string).status()); -} - -// This test is expected to fail. See test class comment. -XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_ParseAndReturnVerifiedModuleBad) { - const char* const hlo_string = R"( -HloModule ParseAndReturnVerifiedModuleBad - -ENTRY entry { - x = f32[] parameter(0) - y = f32[] parameter(1) - ROOT add = f32[1234] add(x,y) -} -)"; - - ASSERT_IS_NOT_OK(ParseAndReturnVerifiedModule(hlo_string).status()); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index c622b295094..a78ccacec11 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -68,7 +68,7 @@ class LLVMCompilerTest : public ::testing::Test { builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); compiler->SetPreOptimizationHook(pre_opt_hook); @@ -90,7 +90,7 @@ class LLVMCompilerTest : public ::testing::Test { builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); - std::unique_ptr hlo_module = CreateNewModule(); + std::unique_ptr hlo_module = CreateNewUnverifiedModule(); hlo_module->AddEntryComputation(builder.Build()); auto module_group = absl::make_unique("test_module_group"); @@ -124,9 +124,9 @@ class LLVMCompilerTest : public ::testing::Test { return ::testing::UnitTest::GetInstance()->current_test_info()->name(); } - static std::unique_ptr CreateNewModule() { + static std::unique_ptr CreateNewUnverifiedModule() { HloModuleConfig config; - config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + config.set_debug_options(GetDebugOptionsFromFlags()); return absl::make_unique(TestName(), config); } }; diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc 
b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc index ca7637a0cfa..3f5135438fc 100644 --- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc +++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc @@ -62,7 +62,7 @@ class MultiOutputFusionTest : public HloTestBase { void RunTest2D(bool manual_fusion, int64 size) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); const Shape elem_shape0 = ShapeUtil::MakeShapeWithLayout(F32, {}, {}); const Shape elem_shape2 = @@ -122,7 +122,7 @@ class MultiOutputFusionTest : public HloTestBase { void RunTest1D(bool manual_fusion, int size) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = CreateNewModule(); + auto hlo_module = CreateNewUnverifiedModule(); const Shape elem_shape_F32 = ShapeUtil::MakeShapeWithDescendingLayout(F32, {size}); diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc index 2cc33ab0963..3fb69419e73 100644 --- a/tensorflow/compiler/xla/tests/slice_test.cc +++ b/tensorflow/compiler/xla/tests/slice_test.cc @@ -166,6 +166,26 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) { ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001)); } +TEST_F(SliceTest, SliceOfReshape) { + Array2D values(2 * 3 * 24, 7); + values.FillIota(1); + XlaBuilder builder(TestName()); + auto original = ConstantR2FromArray2D(&builder, values); + auto reshape = Reshape(original, {24, 3, 2, 7}); + Slice(reshape, {0, 0, 0, 0}, {11, 3, 2, 7}, {1, 1, 1, 1}); + ComputeAndCompare(&builder, {}); +} + +TEST_F(SliceTest, SliceOfCollapsingReshape) { + Array4D values(2, 3, 5, 7); + values.FillIota(1); + XlaBuilder builder(TestName()); + auto original = ConstantR4FromArray4D(&builder, values); + auto reshape = Reshape(original, {2 * 3 * 5, 7}); + Slice(reshape, {0, 0}, {4, 7}, {1, 1}); + ComputeAndCompare(&builder, {}); +} + XLA_TEST_F(SliceTest, StridedSliceR4WithOutputLayout) { Array4D values(2, 4, 6, 8); values.FillRandom(3.14f); @@ -253,7 +273,6 @@ XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run(GetParam()); } XLA_TEST_P(SliceR1Test, DoIt_PRED) { Run(GetParam()); } - // Tests for R1 slice ops. // The format for each testcase is {input size, start, limit, stride}. 
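// For instance, a hypothetical entry such as {1024, 128, 256, 2} would slice
// elements [128, 256) of a 1024-element input with a stride of 2.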
// clang-format off diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc index b34fd0f2e87..a2b7c26331b 100644 --- a/tensorflow/compiler/xla/tests/token_hlo_test.cc +++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc @@ -28,7 +28,7 @@ namespace { class TokenHloTest : public HloTestBase {}; XLA_TEST_F(TokenHloTest, SingleTokenInstruction) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); builder.AddInstruction(HloInstruction::CreateToken()); @@ -38,8 +38,22 @@ XLA_TEST_F(TokenHloTest, SingleTokenInstruction) { EXPECT_TRUE(LiteralTestUtil::Equal(result, LiteralUtil::CreateToken())); } +XLA_TEST_F(TokenHloTest, TokenInTuple) { + std::unique_ptr module = CreateNewUnverifiedModule(); + auto builder = HloComputation::Builder(TestName()); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + builder.AddInstruction(HloInstruction::CreateTuple({token})); + + module->AddEntryComputation(builder.Build()); + + TF_ASSERT_OK_AND_ASSIGN(Literal result, Execute(std::move(module), {})); + Literal token_literal = LiteralUtil::CreateToken(); + EXPECT_TRUE( + LiteralTestUtil::Equal(result, LiteralUtil::MakeTuple({&token_literal}))); +} + XLA_TEST_F(TokenHloTest, TokenTree) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto token0 = builder.AddInstruction(HloInstruction::CreateToken()); auto token1 = builder.AddInstruction(HloInstruction::CreateToken()); @@ -54,7 +68,7 @@ XLA_TEST_F(TokenHloTest, TokenTree) { } XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); builder.AddInstruction( HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); @@ -75,7 +89,7 @@ XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) { } XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); builder.AddInstruction(HloInstruction::CreateParameter( 0, @@ -95,7 +109,7 @@ XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) { } XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) { - std::unique_ptr module = CreateNewModule(); + std::unique_ptr module = CreateNewUnverifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param = builder.AddInstruction( HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 376559500ef..ca036f1ae0d 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -91,8 +91,8 @@ Status ParseOneProfileOutputLine( string match_usecs = "([0-9.]+) usec"; string match_flops = "([^ ]*)"; string match_trops = "([^ ]*)"; - string match_bytes_per_sec = "([0-9.TGMKi]+)B/s"; - string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle"; + string match_bytes_per_sec = "([0-9.TGMKi]*)(?:B/s)?"; + string match_bytes_per_cycle = "([0-9.TGMKi]*)(?:B/cycle)?"; // The underlined part is what we're trying to match with match_opcode: // @@ -307,6 +307,7 @@ 
XLA_TEST_F(HloProfileTest, ProfileWhileComputation) { string profile_output; ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape, matrix_shape); + SCOPED_TRACE(profile_output); std::vector profile_output_lines = absl::StrSplit(profile_output, '\n'); @@ -318,14 +319,13 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) { ASSERT_NE(while_body_profile_start, profile_output_lines.cend()); - auto while_body_profile_end = std::find_if( - while_body_profile_start, profile_output_lines.end(), - [](absl::string_view s) { - return absl::StartsWith(s, "********** microseconds report **********"); - }); + auto while_body_profile_end = + std::find_if(while_body_profile_start, profile_output_lines.end(), + [](absl::string_view s) { + return absl::StartsWith(s, "********** microseconds "); + }); - // We emit a blank line before the "********** microseconds report **********" - // line. + // We emit a blank line before the "microseconds report" line. while_body_profile_end--; ASSERT_NE(while_body_profile_end, profile_output_lines.end()); @@ -380,7 +380,7 @@ static std::pair AddXlaHloProfileFlag(int argc, char** argv) { GTEST_API_ int main(int argc, char** argv) { std::vector flag_list; - xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); + xla::AppendDebugOptionsFlags(&flag_list); std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv); auto usage = tensorflow::Flags::Usage(argv[0], flag_list); diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc index 15603619b62..dca0aa52a53 100644 --- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc +++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc @@ -15,14 +15,14 @@ limitations under the License. #include "absl/strings/match.h" #include "absl/strings/string_view.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" GTEST_API_ int main(int argc, char** argv) { std::vector flag_list; - xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); + xla::AppendDebugOptionsFlags(&flag_list); auto usage = tensorflow::Flags::Usage(argv[0], flag_list); if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) { LOG(ERROR) << "\n" << usage; @@ -49,7 +49,7 @@ GTEST_API_ int main(int argc, char** argv) { // different API than Tensorflow's. 
testing::InitGoogleTest(&argc, argv); #if defined(PLATFORM_GOOGLE) - base::SetFlag(&FLAGS_benchmarks, pattern); + absl::SetFlag(&FLAGS_benchmarks, pattern); RunSpecifiedBenchmarks(); #else tensorflow::testing::Benchmark::Run(pattern); diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index 3a086c66bbb..8926bbed2b5 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -33,6 +33,7 @@ cc_library( name = "dumped_computation_to_graphviz_library", srcs = ["dumped_computation_to_graphviz.cc"], deps = [ + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", @@ -40,7 +41,6 @@ cc_library( "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/core:lib", @@ -78,6 +78,7 @@ cc_library( name = "replay_computation_library", srcs = ["replay_computation.cc"], deps = [ + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -91,7 +92,6 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:testing", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service/gpu:infeed_manager", @@ -207,13 +207,13 @@ tf_cc_binary( name = "dumped_computation_to_tf_graphdef", srcs = ["dumped_computation_to_tf_graphdef.cc"], deps = [ + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:hlo_graph_dumper", "//tensorflow/compiler/xla/service:hlo_proto", diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc index c866a13de75..b623556468f 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc @@ -33,7 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/statusor.h" @@ -54,7 +54,7 @@ void RealMain(absl::Span args) { tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module)); XlaComputation computation = client->LoadSnapshot(module).ConsumeValueOrDie(); - DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags(); + DebugOptions debug_options = GetDebugOptionsFromFlags(); debug_options.set_xla_generate_hlo_graph(".*"); ComputationStats stats = client->GetComputationStats(computation, debug_options) @@ -68,7 +68,7 @@ void RealMain(absl::Span args) { int main(int argc, char** argv) { std::vector flag_list; - xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); + xla::AppendDebugOptionsFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc index 07ef5ff656b..f8bb9a6b1e2 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc @@ -31,7 +31,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/statusor.h" @@ -53,7 +53,7 @@ void RealMain(absl::Span args) { tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module)); XlaComputation computation = client->LoadSnapshot(module).ConsumeValueOrDie(); - DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags(); + DebugOptions debug_options = GetDebugOptionsFromFlags(); debug_options.set_xla_generate_hlo_graph(".*"); debug_options.set_xla_hlo_dump_as_graphdef(true); ComputationStats stats = @@ -68,7 +68,7 @@ void RealMain(absl::Span args) { int main(int argc, char** argv) { std::vector flag_list; - xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); + xla::AppendDebugOptionsFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index 109411f99b6..47be9f5adf1 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -47,8 +47,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/testing.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/execution_options_util.h" -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" @@ -191,8 +191,7 @@ StatusOr ReplayComputation(const HloSnapshot& module, // Run the computation num_runs times, and return the result from the last // execution. - const bool xla_hlo_profile = - legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile(); + const bool xla_hlo_profile = GetDebugOptionsFromFlags().xla_hlo_profile(); StreamExecutorMemoryAllocator allocator( client->platform(), {client->platform()->ExecutorForDevice(0).ValueOrDie()}); diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 65948ef4b0c..28df3b03f39 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -322,6 +322,34 @@ message UnregisterRequest { message UnregisterResponse { } +message CompileRequest { + // The graph to be compiled. + HloModuleProto computation = 1; + + // Options that affect how XLA compiles code to service this request. + ExecutionOptions execution_options = 2; + + // The layouts of the input arguments. If not set, the default layout will be + // used. Although the real arguments are not needed in compilation, the + // layouts of the arguments can affect the compilation. + repeated Shape input_shape_with_layout = 3; +} + +message CompileResponse { + // The handle to the executable. + ExecutionHandle handle = 1; +} + +message ExecuteRequest { + ExecutionHandle handle = 1; + + // The shape and layout of the arguments must be the same as the those of the + // executable's parameters. + repeated GlobalDataHandle arguments = 2; +} + +// TODO(b/118493728): Remove this and ExecuteGraphParallelRequest and replace +// the uses with calls to Compile and Execute. message ExecuteGraphRequest { HloModuleProto computation = 1; repeated GlobalDataHandle arguments = 2; diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index b6bd919e2b2..683ccc40f16 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -332,11 +332,13 @@ message LiteralProto { repeated double f64s = 9; repeated float c64s = 12; // Stored as interleaved real, imag floats. 
repeated LiteralProto tuple_literals = 10; - // The F16s and BF16s are encoded in little endian byte order + // The F16s, BF16s, U16s and S16s are encoded in little endian byte order bytes f16s = 11; bytes bf16s = 13; + bytes u16s = 16; + bytes s16s = 17; repeated int64 sparse_indices = 14; - // Next = 16 + // Next = 18 } message WindowDimension { diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD index 9e3d2454d16..67f475846e5 100644 --- a/tensorflow/compiler/xrt/kernels/BUILD +++ b/tensorflow/compiler/xrt/kernels/BUILD @@ -12,6 +12,7 @@ cc_library( hdrs = ["xrt_state_ops.h"], deps = [ "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -21,7 +22,6 @@ cc_library( "//tensorflow/compiler/xla/client:compile_only_client", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:hlo_proto", diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index 5678f0905ff..6ab77fbaaf0 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -6,6 +6,24 @@ import "tensorflow/compiler/tf2xla/host_compute_metadata.proto"; import "tensorflow/compiler/xla/xla_data.proto"; import "tensorflow/compiler/xla/service/hlo.proto"; +message DeviceAssignment { + message ComputationDevice { + message DeviceMeshCoordinates { + // The mesh coordinates for the device. Usually (X, Y, Core), in the order + // in which they are returned in the TopologyProto. + // X = value(0) + // Y = value(1) + // Core = value(2) + repeated int32 value = 1; + } + // As many replicas as there are in the replicated computation. + repeated DeviceMeshCoordinates replica_devices = 1; + } + // As many ComputationDevice as many there are computations (number + // of cores per replica). + repeated ComputationDevice computation_devices = 1; +} + // Options for an XLA compilation. message XLAComputationConfig { // The number of replicas the computation will be run on. If this is @@ -23,6 +41,11 @@ message XLAComputationConfig { // computation. per_core_args_and_result_shapes is optional for a // single-core computation. repeated xla.ProgramShape per_core_program_shape = 5; + // Describes how replicated computation instances should be assigned to + // devices. There are num_cores_per_replica computations, and each one will be + // sent and executed to the set of replica device numbers described in the + // DeviceAssignment proto. + DeviceAssignment device_assignment = 6; } // Options and XLA computation for a compilation. diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index f45010ec26e..1fffbb5f660 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -142,7 +142,7 @@ class InequalitySplitHandler(base_split_handler.BaseSplitHandler): name="StatsAccumulator/{}".format(self._name)) # Allocate both stats accumulator and quantile accumulator on the same # device so that we can build splits with fewer RPCs. 
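    # Colocation is requested here by entering ops.colocate_with on the stats
    # accumulator's resource handle before the quantile accumulator below is
    # constructed.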
- with ops.colocate_with(self._stats_accumulator.resource()): + with ops.colocate_with(self._stats_accumulator.resource_handle): self._quantile_accumulator = quantile_ops.QuantileAccumulator( init_stamp_token, epsilon=epsilon, @@ -268,8 +268,8 @@ class DenseSplitHandler(InequalitySplitHandler): handler = make_dense_split_tensor are_splits_ready, partition_ids, gains, split_infos = ( - handler(self._quantile_accumulator.resource(), - self._stats_accumulator.resource(), stamp_token, + handler(self._quantile_accumulator.resource_handle, + self._stats_accumulator.resource_handle, stamp_token, next_stamp_token, self._multiclass_strategy, class_id, self._feature_column_group_id, self._l1_regularization, self._l2_regularization, self._tree_complexity_regularization, @@ -447,8 +447,8 @@ class SparseSplitHandler(InequalitySplitHandler): handler = make_sparse_split_tensor are_splits_ready, partition_ids, gains, split_infos = ( - handler(self._quantile_accumulator.resource(), - self._stats_accumulator.resource(), stamp_token, + handler(self._quantile_accumulator.resource_handle, + self._stats_accumulator.resource_handle, stamp_token, next_stamp_token, self._multiclass_strategy, class_id, self._feature_column_group_id, self._l1_regularization, self._l2_regularization, self._tree_complexity_regularization, diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py index 05ce0884ccf..356ae337685 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py @@ -34,7 +34,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -62,7 +62,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2, 1], @@ -91,7 +91,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -123,7 +123,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -133,7 +133,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with ops.control_dependencies([op1]): (stamp_token, num_updates, partition_1, feature_1, grads_1, - hessians_1) = accumulator.serialize() + hessians_1) = accumulator.saveable.serialize() # Make sure that the accumulator hasn't changed during serialization. 
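The split handlers and tests switch from calling the accumulator's resource() method to reading its resource_handle property, and from the private _create_op to the public initializer. One generic way to stage such a rename without breaking existing callers is to keep the old method as a thin shim; the sketch below is only an illustration of that migration pattern, not code from this change (the patch itself removes resource() outright).

import warnings


class Accumulator:
    """Illustrative only: exposes resource_handle as a property."""

    def __init__(self):
        self._resource_handle = object()   # stand-in for the real handle tensor

    @property
    def resource_handle(self):
        return self._resource_handle

    def resource(self):
        """Deprecated shim so old call sites keep working during migration."""
        warnings.warn("resource() is deprecated; use resource_handle",
                      DeprecationWarning, stacklevel=2)
        return self.resource_handle


acc = Accumulator()
assert acc.resource() is acc.resource_handle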
with ops.control_dependencies([stamp_token]): num_updates_2, partition_2, feature_2, grads_2, hessians_2 = ( @@ -164,7 +164,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): # These will be deleted due to deserialize call. op1 = accumulator.add( stamp_token=0, @@ -175,7 +175,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with ops.control_dependencies([op1]): deserialize = ( - accumulator.deserialize( + accumulator.saveable.deserialize( stamp_token=2, num_updates=3, partition_ids=[3, 4], @@ -223,7 +223,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -261,7 +261,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -299,7 +299,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -336,7 +336,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -349,7 +349,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): with ops.control_dependencies([op1]): (stamp_token, num_updates_1, partition_1, feature_1, grads_1, - hessians_1) = accumulator.serialize() + hessians_1) = accumulator.saveable.serialize() # Make sure that the accumulator hasn't changed during serialization. with ops.control_dependencies([stamp_token]): num_updates_2, partition_2, feature_2, grads_2, hessians_2 = ( @@ -386,7 +386,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): # These will be deleted due to deserialize call. 
op1 = accumulator.add( stamp_token=0, @@ -399,7 +399,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): 0.08]]]) with ops.control_dependencies([op1]): - deserialize = accumulator.deserialize( + deserialize = accumulator.saveable.deserialize( stamp_token=2, num_updates=3, partition_ids=[3, 4], diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py index 25b2c9e2fd7..fca22c71a83 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + # pylint: disable=unused-import from tensorflow.contrib.boosted_trees.python.ops import boosted_trees_ops_loader # pylint: enable=unused-import @@ -31,6 +33,7 @@ from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensem from tensorflow.python.framework import ops from tensorflow.python.ops import resources from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking ops.NotDifferentiable("TreeEnsembleVariable") ops.NotDifferentiable("TreeEnsembleSerialize") @@ -82,6 +85,44 @@ class TreeEnsembleVariableSavable(saver.BaseSaverBuilder.SaveableObject): tree_ensemble_config=restored_tensors[1]) +class TreeEnsembleVariable(tracking.TrackableResource): + """A Tree ensemble model.""" + + def __init__(self, stamp_token, tree_ensemble_config, name, container=None): + self._stamp_token = stamp_token + self._tree_ensemble_config = tree_ensemble_config + self._name = name + self._container = container + self._init_op = None + super(TreeEnsembleVariable, self).__init__() + + def create_resource(self): + return gen_model_ops.decision_tree_ensemble_resource_handle_op( + self._container, shared_name=self._name, name=self._name) + + def initialize(self): + return gen_model_ops.create_tree_ensemble_variable( + self.resource_handle, self._stamp_token, self._tree_ensemble_config) + + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + return gen_model_ops.tree_ensemble_is_initialized_op(self.resource_handle) + + def _gather_saveables_for_checkpoint(self): + return { + "tree_ensemble_variable": + functools.partial( + TreeEnsembleVariableSavable, + tree_ensemble_handle=self.resource_handle, + create_op=self.initializer) + } + + def tree_ensemble_variable(stamp_token, tree_ensemble_config, name, @@ -99,12 +140,11 @@ def tree_ensemble_variable(stamp_token, A `Tensor` of type mutable `string`. The handle to the tree ensemble. """ with ops.name_scope(name, "TreeEnsembleVariable") as name: - resource_handle = gen_model_ops.decision_tree_ensemble_resource_handle_op( - container, shared_name=name, name=name) - create_op = gen_model_ops.create_tree_ensemble_variable( - resource_handle, stamp_token, tree_ensemble_config) - is_initialized_op = gen_model_ops.tree_ensemble_is_initialized_op( - resource_handle) + tree_ensemble_var = TreeEnsembleVariable(stamp_token, tree_ensemble_config, + name, container) + resource_handle = tree_ensemble_var.resource_handle + create_op = tree_ensemble_var.initializer + is_initialized_op = tree_ensemble_var.is_initialized() # Adds the variable to the savable list. 
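TreeEnsembleVariable (and, further below, QuantileAccumulator and StatsAccumulator) now follow the TrackableResource contract: create_resource() builds the handle, initialize() builds the init op, the initializer property caches it, and is_initialized() reports state. The following self-contained sketch mirrors that contract with plain Python objects so the control flow is visible without TensorFlow; the base class and the "op" stand-ins are assumptions made for illustration.

class TrackableResourceSketch:
    """Plain-Python stand-in for the TrackableResource contract (illustration only)."""

    def __init__(self):
        self._init_op = None
        self._resource_handle = self.create_resource()

    @property
    def resource_handle(self):
        return self._resource_handle

    @property
    def initializer(self):
        # Lazily build the init op once and reuse it, as the patch does.
        if self._init_op is None:
            self._init_op = self.initialize()
        return self._init_op

    def create_resource(self):
        raise NotImplementedError

    def initialize(self):
        raise NotImplementedError

    def is_initialized(self):
        raise NotImplementedError


class CounterResource(TrackableResourceSketch):
    """Hypothetical resource: the 'handle' is a dict and the init op is a callable."""

    def create_resource(self):
        return {}

    def initialize(self):
        return lambda: self.resource_handle.setdefault("value", 0)

    def is_initialized(self):
        return "value" in self.resource_handle


counter = CounterResource()
assert not counter.is_initialized()
counter.initializer()                                 # run the cached init op
assert counter.is_initialized()
assert counter.initializer is counter.initializer    # built only once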
saveable = TreeEnsembleVariableSavable(resource_handle, create_op, resource_handle.name) diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py index 19b6b3296db..0c319cc9bd1 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py @@ -33,12 +33,60 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import resources from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking # Pattern to remove all non alpha numeric from a string. _PATTERN = re.compile(r"[\W_]+") -class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): +class QuantileAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject): + """SaveableObject implementation for QuantileAccumulator.""" + + def __init__(self, resource_handle, create_op, name): + self._resource_handle = resource_handle + self._create_op = create_op + stamp_token, state, are_buckets_ready, buckets = ( + gen_quantile_ops.quantile_accumulator_serialize(resource_handle)) + # slice_spec is useful for saving a slice from a variable. + # It's not meaningful in quantile accumulator. + slice_spec = "" + def make_save_spec(tensor, suffix): + return saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name + suffix) + + specs = [make_save_spec(stamp_token, "_stamp")] + specs += [make_save_spec(state, "_state")] + specs += [make_save_spec(are_buckets_ready, "_are_buckets_ready")] + specs += [make_save_spec(buckets, "buckets")] + super(QuantileAccumulatorSaveable, self).__init__(self._resource_handle, + specs, name) + + def restore(self, restored_tensors, unused_restored_shapes): + """Restores the associated quantile accumulator from 'restored_tensors'. + + Args: + restored_tensors: the tensors that were loaded from a checkpoint. + unused_restored_shapes: the shapes this object should conform to after + restore. + + Returns: + The operation that restores the state of the quantile accumulator. + """ + # Read the restored tensors with the same order that were added to saving + # spec. + stamp_token = restored_tensors[:1] + state = restored_tensors[1:2] + are_buckets_ready = restored_tensors[2:3] + buckets = restored_tensors[3] + with ops.control_dependencies([self._create_op]): + return gen_quantile_ops.quantile_accumulator_deserialize( + self._resource_handle, + stamp_token=stamp_token, + stream_state=state, + are_buckets_ready=are_buckets_ready, + buckets=buckets) + + +class QuantileAccumulator(tracking.TrackableResource): """A resource that allows distributed quantile computation.""" def __init__(self, @@ -61,82 +109,64 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): generate_quantiles: Generate quantiles instead of approximate boundaries. If true, exactly `num_quantiles` will be produced in the final summary. 
""" + self._init_stamp_token = init_stamp_token self._epsilon = epsilon + self._num_quantiles = num_quantiles + self._max_elements = max_elements + self._container = container self._generate_quantiles = generate_quantiles + super(QuantileAccumulator, self).__init__() name = _PATTERN.sub("", name) with ops.name_scope(name, "QuantileAccumulator") as name: - self._quantile_accumulator_handle = ( - gen_quantile_ops.quantile_stream_resource_handle_op( - container=container, shared_name=name, name=name)) - self._create_op = gen_quantile_ops.create_quantile_accumulator( - self._quantile_accumulator_handle, - init_stamp_token, - epsilon=epsilon, - max_elements=max_elements, - num_quantiles=num_quantiles, - generate_quantiles=generate_quantiles) - is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized( - self._quantile_accumulator_handle) - resources.register_resource(self._quantile_accumulator_handle, - self._create_op, is_initialized_op) - self._make_savable(name) + self._name = name + self._resource_handle = self.create_resource() + self._init_op = self.initialize() + is_initialized_op = self.is_initialized() + resources.register_resource(self.resource_handle, self._init_op, + is_initialized_op) + self._saveable = QuantileAccumulatorSaveable(self.resource_handle, + self._init_op, name) + ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable) - def _make_savable(self, name): - stamp_token, state, are_buckets_ready, buckets = ( - gen_quantile_ops.quantile_accumulator_serialize( - self._quantile_accumulator_handle)) - # slice_spec is useful for saving a slice from a variable. - # It's not meaningful in quantile accumulator. - slice_spec = "" - def make_save_spec(tensor, suffix): - return saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name + suffix) + def create_resource(self): + return gen_quantile_ops.quantile_stream_resource_handle_op( + container=self._container, shared_name=self._name, name=self._name) - specs = [make_save_spec(stamp_token, "_stamp")] - specs += [make_save_spec(state, "_state")] - specs += [make_save_spec(are_buckets_ready, "_are_buckets_ready")] - specs += [make_save_spec(buckets, "buckets")] - super(QuantileAccumulator, - self).__init__(self._quantile_accumulator_handle, specs, name) - ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self) + def initialize(self): + return gen_quantile_ops.create_quantile_accumulator( + self.resource_handle, + self._init_stamp_token, + epsilon=self._epsilon, + max_elements=self._max_elements, + num_quantiles=self._num_quantiles, + generate_quantiles=self._generate_quantiles) - def restore(self, restored_tensors, unused_restored_shapes): - """Restores the associated quantile accumulator from 'restored_tensors'. + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op - Args: - restored_tensors: the tensors that were loaded from a checkpoint. - unused_restored_shapes: the shapes this object should conform to after - restore. + def is_initialized(self): + return gen_quantile_ops.quantile_accumulator_is_initialized( + self.resource_handle) - Returns: - The operation that restores the state of the quantile accumulator. - """ - # Read the restored tensors with the same order that were added to saving - # spec. 
- stamp_token = restored_tensors[:1] - state = restored_tensors[1:2] - are_buckets_ready = restored_tensors[2:3] - buckets = restored_tensors[3] - with ops.control_dependencies([self._create_op]): - return gen_quantile_ops.quantile_accumulator_deserialize( - self._quantile_accumulator_handle, - stamp_token=stamp_token, - stream_state=state, - are_buckets_ready=are_buckets_ready, - buckets=buckets) + def _gather_saveables_for_checkpoint(self): + return {"quantile_accumulator": self._saveable} def get_buckets(self, stamp_token): """Returns quantile buckets created during previous flush.""" are_buckets_ready, buckets = ( gen_quantile_ops.quantile_accumulator_get_buckets( - quantile_accumulator_handles=[self._quantile_accumulator_handle], + quantile_accumulator_handles=[self.resource_handle], stamp_token=stamp_token)) return are_buckets_ready[0], buckets[0] def schedule_get_buckets(self): """Returns a scheduled read of buckets created during previous flush.""" return batch_ops_utils.ScheduledStampedResourceOp( - resource_handle=self._quantile_accumulator_handle, + resource_handle=self.resource_handle, op=gen_quantile_ops.quantile_accumulator_get_buckets) def _make_summary(self, column, example_weights): @@ -161,14 +191,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): """Adds quantile summary to its stream in resource.""" summary = self._make_summary(column, example_weights) return gen_quantile_ops.quantile_accumulator_add_summaries( - quantile_accumulator_handles=[self._quantile_accumulator_handle], + quantile_accumulator_handles=[self.resource_handle], stamp_token=stamp_token, summaries=[summary]) def add_prebuilt_summary(self, stamp_token, summary): """Adds quantile summary to its stream in resource.""" return gen_quantile_ops.quantile_accumulator_add_summaries( - quantile_accumulator_handles=[self._quantile_accumulator_handle], + quantile_accumulator_handles=[self.resource_handle], stamp_token=stamp_token, summaries=[summary]) @@ -177,7 +207,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): summary = self._make_summary(column, example_weights) return batch_ops_utils.ScheduledStampedResourceOp( op=gen_quantile_ops.quantile_accumulator_add_summaries, - resource_handle=self._quantile_accumulator_handle, + resource_handle=self.resource_handle, summaries=summary) def flush(self, stamp_token, next_stamp_token): @@ -190,17 +220,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): The flush operation.
""" return gen_quantile_ops.quantile_accumulator_flush( - quantile_accumulator_handle=self._quantile_accumulator_handle, + quantile_accumulator_handle=self.resource_handle, stamp_token=stamp_token, next_stamp_token=next_stamp_token) def flush_summary(self, stamp_token, next_stamp_token): """Finalizes quantile summary stream and resets it for next iteration.""" result = gen_quantile_ops.quantile_accumulator_flush_summary( - quantile_accumulator_handle=self._quantile_accumulator_handle, + quantile_accumulator_handle=self.resource_handle, stamp_token=stamp_token, next_stamp_token=next_stamp_token) return result - - def resource(self): - return self._quantile_accumulator_handle diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py index 2e94e353f32..ad1191d4123 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py @@ -26,12 +26,83 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import resources from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking # Pattern to remove all non alpha numeric from a string. _PATTERN = re.compile(r"[\W_]+") -class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): +class StatsAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject): + """SaveableObject implementation for StatsAccumulator.""" + + def __init__(self, resource_handle, create_op, is_scalar, name): + self._create_op = create_op + self._resource_handle = resource_handle + self._is_scalar = is_scalar + slice_spec = "" + saver_name = self._resource_handle.name + (stamp_token, num_updates, partition_ids, feature_ids, gradients, + hessians) = self.serialize() + specs = [ + saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec, + saver_name + "_stamp"), + saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec, + saver_name + "_num_updates"), + saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec, + saver_name + "_partition_ids"), + saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec, + saver_name + "_feature_ids"), + saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec, + saver_name + "_gradients"), + saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec, + saver_name + "hessians"), + ] + super(StatsAccumulatorSaveable, self).__init__(self._resource_handle, specs, + name) + + def serialize(self): + """Serializes the stats accumulator state.""" + if self._is_scalar: + return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize( + self._resource_handle) + else: + return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize( + self._resource_handle) + + def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids, + gradients, hessians): + """Resets the stats accumulator with the serialized state.""" + if self._is_scalar: + return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize( + self._resource_handle, stamp_token, num_updates, partition_ids, + feature_ids, gradients, hessians) + else: + return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize( + self._resource_handle, stamp_token, num_updates, partition_ids, + feature_ids, gradients, hessians) + + def restore(self, restored_tensors, unused_restored_shapes): + """Restores the associated tree ensemble from 'restored_tensors'. 
+ + Args: + restored_tensors: the tensors that were loaded from a checkpoint. + unused_restored_shapes: the shapes this object should conform to after + restore. Not meaningful for trees. + + Returns: + The operation that restores the state of the tree ensemble variable. + """ + with ops.control_dependencies([self._create_op]): + return self.deserialize( + stamp_token=restored_tensors[0], + num_updates=restored_tensors[1], + partition_ids=restored_tensors[2], + feature_ids=restored_tensors[3], + gradients=restored_tensors[4], + hessians=restored_tensors[5]) + + +class StatsAccumulator(tracking.TrackableResource): """A resource that allows to accumulate gradients and hessians. For consistency guarantees, we use read and write stamp tokens. @@ -58,58 +129,69 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): Returns: A `Tensor` of type mutable `string`. The handle to the stats accumulator. """ + self._stamp_token = stamp_token + self._gradient_shape = gradient_shape + self._hessian_shape = hessian_shape + self._container = container + + if (gradient_shape == tensor_shape.scalar() and + hessian_shape == tensor_shape.scalar()): + self._is_scalar = True + else: + self._is_scalar = False + if name is not None: name = _PATTERN.sub("", name) with ops.name_scope(name, "StatsAccumulator") as name: - # Both values are scalars. - if (gradient_shape == tensor_shape.scalar() and - hessian_shape == tensor_shape.scalar()): - self._is_scalar = True - self._resource_handle = (gen_stats_accumulator_ops. - stats_accumulator_scalar_resource_handle_op( - container, name, name=name)) - - create_op = gen_stats_accumulator_ops.create_stats_accumulator_scalar( - self._resource_handle, stamp_token) - is_initialized_op = ( - gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized( - self._resource_handle)) - else: - self._is_scalar = False - self._resource_handle = (gen_stats_accumulator_ops. 
- stats_accumulator_tensor_resource_handle_op( - container, name, name=name)) - create_op = gen_stats_accumulator_ops.create_stats_accumulator_tensor( - self._resource_handle, stamp_token, gradient_shape.as_list(), - hessian_shape.as_list()) - is_initialized_op = ( - gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized( - self._resource_handle)) - - self._create_op = create_op - slice_spec = "" - saver_name = self._resource_handle.name - (stamp_token, num_updates, partition_ids, feature_ids, gradients, - hessians) = self.serialize() - specs = [ - saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec, - saver_name + "_stamp"), - saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec, - saver_name + "_num_updates"), - saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec, - saver_name + "_partition_ids"), - saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec, - saver_name + "_feature_ids"), - saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec, - saver_name + "_gradients"), - saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec, - saver_name + "hessians"), - ] - - super(StatsAccumulator, self).__init__(self._resource_handle, specs, name) - resources.register_resource(self._resource_handle, create_op, + self._name = name + self._resource_handle = self.create_resource() + self._init_op = self.initialize() + is_initialized_op = self.is_initialized() + resources.register_resource(self.resource_handle, self.initializer, is_initialized_op) - ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self) + self._saveable = StatsAccumulatorSaveable( + self.resource_handle, self.initializer, self._is_scalar, name) + ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable) + + def create_resource(self): + if self._is_scalar: + return ( + gen_stats_accumulator_ops.stats_accumulator_scalar_resource_handle_op( + self._container, self._name, name=self._name)) + else: + return ( + gen_stats_accumulator_ops.stats_accumulator_tensor_resource_handle_op( + self._container, self._name, name=self._name)) + + def initialize(self): + if self._is_scalar: + return gen_stats_accumulator_ops.create_stats_accumulator_scalar( + self.resource_handle, self._stamp_token) + else: + return gen_stats_accumulator_ops.create_stats_accumulator_tensor( + self.resource_handle, self._stamp_token, + self._gradient_shape.as_list(), self._hessian_shape.as_list()) + + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + if self._is_scalar: + return gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized( + self.resource_handle) + else: + return gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized( + self.resource_handle) + + @property + def saveable(self): + return self._saveable + + def _gather_saveables_for_checkpoint(self): + return {"stats_accumulator", self.saveable} def add(self, stamp_token, partition_ids, feature_ids, gradients, hessians): """Updates the stats accumulator.""" @@ -117,11 +199,11 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): partition_ids, feature_ids, gradients, hessians)) if self._is_scalar: return gen_stats_accumulator_ops.stats_accumulator_scalar_add( - [self._resource_handle], stamp_token, [partition_ids], [feature_ids], + [self.resource_handle], stamp_token, [partition_ids], [feature_ids], [gradients], [hessians]) else: return gen_stats_accumulator_ops.stats_accumulator_tensor_add( - [self._resource_handle], stamp_token, 
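Both accumulators and the tree ensemble variable now expose their checkpoint state through _gather_saveables_for_checkpoint, which the object-based checkpointing machinery expects to be a dictionary keyed by a short name, mapping to a SaveableObject or a factory for one (model_ops.py above uses functools.partial for the factory form). The sketch below shows that return shape with stand-in classes; the class names are hypothetical and only the dict-of-factories structure is the point.

import functools


class FakeSaveable:
    """Stand-in for a BaseSaverBuilder.SaveableObject (illustration only)."""

    def __init__(self, handle, create_op, name):
        self.handle, self.create_op, self.name = handle, create_op, name


class FakeTrackable:
    def __init__(self):
        self.resource_handle = object()
        self.initializer = object()

    def _gather_saveables_for_checkpoint(self):
        # A dict keyed by attribute name, mapping to a saveable or a factory for one.
        return {
            "tree_ensemble_variable": functools.partial(
                FakeSaveable,
                handle=self.resource_handle,
                create_op=self.initializer),
        }


saveables = FakeTrackable()._gather_saveables_for_checkpoint()
saveable = saveables["tree_ensemble_variable"](name="ensemble")
print(saveable.name)   # ensemble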
[partition_ids], [feature_ids], + [self.resource_handle], stamp_token, [partition_ids], [feature_ids], [gradients], [hessians]) def schedule_add(self, partition_ids, feature_ids, gradients, hessians): @@ -131,7 +213,7 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): if self._is_scalar: return batch_ops_utils.ScheduledStampedResourceOp( op=gen_stats_accumulator_ops.stats_accumulator_scalar_add, - resource_handle=self._resource_handle, + resource_handle=self.resource_handle, partition_ids=partition_ids, feature_ids=feature_ids, gradients=gradients, @@ -139,7 +221,7 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): else: return batch_ops_utils.ScheduledStampedResourceOp( op=gen_stats_accumulator_ops.stats_accumulator_tensor_add, - resource_handle=self._resource_handle, + resource_handle=self.resource_handle, partition_ids=partition_ids, feature_ids=feature_ids, gradients=gradients, @@ -153,55 +235,11 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): return gen_stats_accumulator_ops.stats_accumulator_tensor_make_summary( partition_ids, feature_ids, gradients, hessians) - def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids, - gradients, hessians): - """Resets the stats accumulator with the serialized state.""" - if self._is_scalar: - return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize( - self._resource_handle, stamp_token, num_updates, partition_ids, - feature_ids, gradients, hessians) - else: - return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize( - self._resource_handle, stamp_token, num_updates, partition_ids, - feature_ids, gradients, hessians) - def flush(self, stamp_token, next_stamp_token): """Flushes the stats accumulator.""" if self._is_scalar: return gen_stats_accumulator_ops.stats_accumulator_scalar_flush( - self._resource_handle, stamp_token, next_stamp_token) + self.resource_handle, stamp_token, next_stamp_token) else: return gen_stats_accumulator_ops.stats_accumulator_tensor_flush( - self._resource_handle, stamp_token, next_stamp_token) - - def serialize(self): - """Serializes the stats accumulator state.""" - if self._is_scalar: - return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize( - self._resource_handle) - else: - return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize( - self._resource_handle) - - def restore(self, restored_tensors, unused_restored_shapes): - """Restores the associated tree ensemble from 'restored_tensors'. - - Args: - restored_tensors: the tensors that were loaded from a checkpoint. - unused_restored_shapes: the shapes this object should conform to after - restore. Not meaningful for trees. - - Returns: - The operation that restores the state of the tree ensemble variable. 
- """ - with ops.control_dependencies([self._create_op]): - return self.deserialize( - stamp_token=restored_tensors[0], - num_updates=restored_tensors[1], - partition_ids=restored_tensors[2], - feature_ids=restored_tensors[3], - gradients=restored_tensors[4], - hessians=restored_tensors[5]) - - def resource(self): - return self._resource_handle + self.resource_handle, stamp_token, next_stamp_token) diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 1cf61a10ba2..ab5713fbe26 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -992,7 +992,7 @@ class GradientBoostedDecisionTreeModel(object): # Get accumulated steps and examples for the current layer. _, _, _, _, acc_examples, acc_steps = ( - steps_accumulator.serialize()) + steps_accumulator.saveable.serialize()) acc_examples = math_ops.cast(acc_examples[0], dtypes.int64) acc_steps = math_ops.cast(acc_steps[0], dtypes.int64) ensemble_update_ops.append( diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py index 5ecd4f34183..40b1e667ee6 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py @@ -25,6 +25,13 @@ import six from tensorflow.python.training.server_lib import ClusterSpec +def _format_master_url(master, rpc_layer=None): + if rpc_layer: + return '%s://%s' % (rpc_layer, master) + else: + return master + + @six.add_metaclass(abc.ABCMeta) class ClusterResolver(object): """Abstract class for all implementations of ClusterResolvers. @@ -57,12 +64,13 @@ class ClusterResolver(object): 'cluster_spec is not implemented for {}.'.format(self)) @abc.abstractmethod - def master(self, task_type=None, task_index=None): + def master(self, task_type=None, task_index=None, rpc_layer=None): """Retrieves the name or URL of the session master. Args: task_type: (Optional) The type of the TensorFlow task of the master. task_index: (Optional) The index of the TensorFlow task of the master. + rpc_layer: (Optional) The RPC protocol for the given cluster. Returns: The name or URL of the session master. @@ -77,10 +85,18 @@ class ClusterResolver(object): class SimpleClusterResolver(ClusterResolver): """Simple implementation of ClusterResolver that accepts a ClusterSpec.""" - def __init__(self, cluster_spec, master=''): + def __init__(self, cluster_spec, master='', task_type=None, task_index=None, + environment='', num_accelerators_per_worker=0, + rpc_layer=None): """Creates a SimpleClusterResolver from a ClusterSpec.""" super(SimpleClusterResolver, self).__init__() + self._task_type = task_type + self._task_index = task_index + self._environment = environment + self._num_accelerators_per_worker = num_accelerators_per_worker + self._rpc_layer = rpc_layer + if not isinstance(cluster_spec, ClusterSpec): raise TypeError('cluster_spec must be a ClusterSpec.') self._cluster_spec = cluster_spec @@ -93,12 +109,13 @@ class SimpleClusterResolver(ClusterResolver): """Returns the ClusterSpec passed into the constructor.""" return self._cluster_spec - def master(self, task_type=None, task_index=None): + def master(self, task_type=None, task_index=None, rpc_layer=None): """Returns the master address to use when creating a session. 
Args: task_type: (Optional) The type of the TensorFlow task of the master. task_index: (Optional) The index of the TensorFlow task of the master. + rpc_layer: (Optional) The RPC used by distributed TensorFlow. Returns: The name or URL of the session master. @@ -106,10 +123,52 @@ class SimpleClusterResolver(ClusterResolver): If a task_type and task_index is given, this will override the `master` string passed into the initialization function. """ - if task_type and task_index: - return self.cluster_spec().task_address(task_type, task_index) + if task_type is not None and task_index is not None: + master = self.cluster_spec().task_address(task_type, task_index) + else: + master = self._master - return self._master + return _format_master_url(master, rpc_layer or self._rpc_layer) + + @property + def task_type(self): + return self._task_type + + @property + def task_index(self): + return self._task_index + + @task_type.setter + def task_type(self, task_type): + self._task_type = task_type + + @task_index.setter + def task_index(self, task_index): + self._task_index = task_index + + @property + def environment(self): + return self._environment + + def num_accelerators_per_worker(self, session_config=None): + """Returns the number of accelerator cores per worker. + + Args: + session_config: Unused. The SimpleClusterResolver does not do automatic + detection of accelerators, so a TensorFlow session will never be + created, and thus a `session_config` is never necessary here, and will + be ignored. + """ + del session_config + return self._num_accelerators_per_worker + + @property + def rpc_layer(self): + return self._rpc_layer + + @rpc_layer.setter + def rpc_layer(self, rpc_layer): + self._rpc_layer = rpc_layer class UnionClusterResolver(ClusterResolver): @@ -119,13 +178,22 @@ class UnionClusterResolver(ClusterResolver): merges the underlying ClusterResolvers, and returns one unified ClusterSpec when cluster_spec is called. The details of the merge function is documented in the cluster_spec function. + + For additional Cluster Resolver properties such as task type, task index, + rpc layer, environment, etc..., we will return the value from the first + ClusterResolver in the union. """ - def __init__(self, *args): + def __init__(self, *args, **kwargs): """Initializes a UnionClusterResolver with other ClusterResolvers. Args: *args: `ClusterResolver` objects to be unionized. + **kwargs: + rpc_layer - (Optional) Override value for the RPC layer used by + TensorFlow. + task_type - (Optional) Override value for the current task type. + task_index - (Optional) Override value for the current task index. Raises: TypeError: If any argument is not a subclass of `ClusterResolvers`. @@ -133,6 +201,13 @@ class UnionClusterResolver(ClusterResolver): """ super(UnionClusterResolver, self).__init__() + self._rpc_layer = kwargs.pop('rpc_layer', None) + self._task_type = kwargs.pop('task_type', None) + self._task_index = kwargs.pop('task_index', None) + + if kwargs: + raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs)) + if not args: raise ValueError('At least one ClusterResolver is required.') @@ -216,7 +291,7 @@ class UnionClusterResolver(ClusterResolver): return ClusterSpec(merged_cluster) - def master(self, task_type=None, task_index=None): + def master(self, task_type=None, task_index=None, rpc_layer=None): """Returns the master address to use when creating a session. 
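master() on the cluster resolvers now takes an rpc_layer argument and only looks up the cluster address when both task_type and task_index are given as non-None, so a task index of 0 is no longer treated as missing. A small, self-contained sketch of that address formatting follows, with a plain dict standing in for ClusterSpec; the function names are illustrative.

def format_master_url(master, rpc_layer=None):
    return '%s://%s' % (rpc_layer, master) if rpc_layer else master


def resolve_master(cluster, default_master='', task_type=None, task_index=None,
                   rpc_layer=None):
    """`cluster` is a {job_name: [addresses]} dict standing in for ClusterSpec."""
    if task_type is not None and task_index is not None:
        master = cluster[task_type][task_index]
    else:
        master = default_master
    return format_master_url(master, rpc_layer)


cluster = {'worker': ['worker0:2222', 'worker1:2222', 'worker2:2222']}
print(resolve_master(cluster, task_type='worker', task_index=0, rpc_layer='grpc'))
# grpc://worker0:2222 -- index 0 is honoured because the check is `is not None`
print(resolve_master(cluster, default_master='worker1:2222'))
# worker1:2222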
This usually returns the master from the first ClusterResolver passed in, @@ -225,11 +300,45 @@ class UnionClusterResolver(ClusterResolver): Args: task_type: (Optional) The type of the TensorFlow task of the master. task_index: (Optional) The index of the TensorFlow task of the master. + rpc_layer: (Optional) The RPC protocol for the given cluster. Returns: The name or URL of the session master. """ - if task_type and task_index: - return self.cluster_spec().task_address(task_type, task_index) + if task_type is not None and task_index is not None: + master = self.cluster_spec().task_address(task_type, task_index) + return _format_master_url(master, rpc_layer or self._rpc_layer) - return self._cluster_resolvers[0].master() + return self._cluster_resolvers[0].master(rpc_layer=rpc_layer) + + @property + def task_type(self): + return self._task_type or self._cluster_resolvers[0].task_type + + @property + def task_index(self): + return self._task_index or self._cluster_resolvers[0].task_index + + @task_type.setter + def task_type(self, task_type): + self._task_type = task_type + + @task_index.setter + def task_index(self, task_index): + self._task_index = task_index + + @property + def environment(self): + return self._cluster_resolvers[0].environment + + def num_accelerators_per_worker(self, session_config=None): + return self._cluster_resolvers[0].num_accelerators_per_worker( + session_config) + + @property + def rpc_layer(self): + return self._rpc_layer or self._cluster_resolvers[0].rpc_layer + + @rpc_layer.setter + def rpc_layer(self, rpc_layer): + self._rpc_layer = rpc_layer diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py index c004b2e2d3b..b94c9612b5b 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py @@ -57,6 +57,62 @@ class UnionClusterResolverTest(test.TestCase): actual_cluster_spec = union_resolver.cluster_spec() self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) + def testInitSimpleClusterResolver(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + + simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps", + task_index=1, environment="cloud", + num_accelerators_per_worker=8, + rpc_layer="grpc") + + self.assertEqual(simple_resolver.task_type, "ps") + self.assertEqual(simple_resolver.task_index, 1) + self.assertEqual(simple_resolver.environment, "cloud") + self.assertEqual(simple_resolver.num_accelerators_per_worker(), 8) + self.assertEqual(simple_resolver.rpc_layer, "grpc") + + def testOverrideSimpleClusterResolver(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + + simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps", + task_index=1, environment="cloud", + num_accelerators_per_worker=8, + rpc_layer="grpc") + + simple_resolver.task_type = "worker" + simple_resolver.task_index = 2 + simple_resolver.rpc_layer = "http" + + self.assertEqual(simple_resolver.task_type, "worker") + self.assertEqual(simple_resolver.task_index, 2) + self.assertEqual(simple_resolver.rpc_layer, "http") + + def testSimpleOverrideMasterWithTaskIndexZero(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": 
["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + + simple_resolver = SimpleClusterResolver(base_cluster_spec) + actual_master = simple_resolver.master("worker", 0, rpc_layer="grpc") + self.assertEqual(actual_master, "grpc://worker0:2222") + + def testSimpleOverrideMasterWithRpcLayer(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + + simple_resolver = SimpleClusterResolver(base_cluster_spec) + actual_master = simple_resolver.master("worker", 2, rpc_layer="grpc") + self.assertEqual(actual_master, "grpc://worker2:2222") + def testSimpleOverrideMaster(self): base_cluster_spec = server_lib.ClusterSpec({ "ps": ["ps0:2222", "ps1:2222"], @@ -65,7 +121,42 @@ class UnionClusterResolverTest(test.TestCase): simple_resolver = SimpleClusterResolver(base_cluster_spec) actual_master = simple_resolver.master("worker", 2) - self.assertEquals(actual_master, "worker2:2222") + self.assertEqual(actual_master, "worker2:2222") + + def testUnionClusterResolverGetProperties(self): + cluster_spec_1 = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + resolver1 = SimpleClusterResolver(cluster_spec_1, task_type="ps", + task_index=1, environment="cloud", + num_accelerators_per_worker=8, + rpc_layer="grpc") + + cluster_spec_2 = server_lib.ClusterSpec({ + "ps": ["ps2:2222", "ps3:2222"], + "worker": ["worker3:2222", "worker4:2222", "worker5:2222"] + }) + resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker", + task_index=2, environment="local", + num_accelerators_per_worker=16, + rpc_layer="http") + + union_resolver = UnionClusterResolver(resolver1, resolver2) + + self.assertEqual(union_resolver.task_type, "ps") + self.assertEqual(union_resolver.task_index, 1) + self.assertEqual(union_resolver.environment, "cloud") + self.assertEqual(union_resolver.num_accelerators_per_worker(), 8) + self.assertEqual(union_resolver.rpc_layer, "grpc") + + union_resolver.task_type = "worker" + union_resolver.task_index = 2 + union_resolver.rpc_layer = "http" + + self.assertEqual(union_resolver.task_type, "worker") + self.assertEqual(union_resolver.task_index, 2) + self.assertEqual(union_resolver.rpc_layer, "http") def testTwoNonOverlappingJobMergedClusterResolver(self): cluster_spec_1 = server_lib.ClusterSpec({ @@ -116,10 +207,13 @@ class UnionClusterResolverTest(test.TestCase): union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2) unspecified_master = union_cluster.master() - self.assertEquals(unspecified_master, "") + self.assertEqual(unspecified_master, "") specified_master = union_cluster.master("worker", 1) - self.assertEquals(specified_master, "worker1:2222") + self.assertEqual(specified_master, "worker1:2222") + + rpc_master = union_cluster.master("worker", 1, rpc_layer="grpc") + self.assertEqual(rpc_master, "grpc://worker1:2222") def testOverlappingJobMergedClusterResolver(self): cluster_spec_1 = server_lib.ClusterSpec({ diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py index 5083e4d10ba..195b68959b6 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py @@ -30,6 +30,10 @@ except ImportError: _GOOGLE_API_CLIENT_INSTALLED = False +def 
_format_master_url(master, rpc_layer=None): + return '%s://%s' % (rpc_layer, master) if rpc_layer else master + + class GceClusterResolver(ClusterResolver): """Cluster Resolver for Google Compute Engine. @@ -45,7 +49,10 @@ class GceClusterResolver(ClusterResolver): zone, instance_group, port, - job_name='worker', + task_type='worker', + task_index=0, + rpc_layer='grpc', + num_accelerators_per_worker=0, credentials='default', service=None): """Creates a new GceClusterResolver object. @@ -55,13 +62,22 @@ class GceClusterResolver(ClusterResolver): each instance in the instance group. Args: - project: Name of the GCE project - zone: Zone of the GCE instance group - instance_group: Name of the GCE instance group + project: Name of the GCE project. + zone: Zone of the GCE instance group. + instance_group: Name of the GCE instance group. port: Port of the listening TensorFlow server (default: 8470) - job_name: Name of the TensorFlow job this set of instances belongs to + task_type: Name of the TensorFlow job this GCE instance group of VM + instances belong to. + task_index: The task index for this particular VM, within the GCE + instance group. In particular, every single instance should be assigned + a unique ordinal index within an instance group manually so that they + can be distinguished from each other. + rpc_layer: The RPC layer TensorFlow should use to communicate across + instances. + num_accelerators_per_worker: Number of accelerators (GPUs) present per + instance. credentials: GCE Credentials. If nothing is specified, this defaults to - GoogleCredentials.get_application_default() + GoogleCredentials.get_application_default(). service: The GCE API object returned by the googleapiclient.discovery function. (Default: discovery.build('compute', 'v1')). If you specify a custom service object, then the credentials parameter will be ignored. @@ -72,7 +88,9 @@ class GceClusterResolver(ClusterResolver): self._project = project self._zone = zone self._instance_group = instance_group - self._job_name = job_name + self._task_type = task_type + self._task_index = task_index + self._rpc_layer = rpc_layer self._port = port self._credentials = credentials @@ -133,10 +151,58 @@ class GceClusterResolver(ClusterResolver): previous_response=response) worker_list.sort() - return ClusterSpec({self._job_name: worker_list}) + return ClusterSpec({self._task_type: worker_list}) - def master(self, task_type=None, task_index=None): - if task_type and task_index: - return self.cluster_spec().task_address(task_type, task_index) + def master(self, task_type=None, task_index=None, rpc_layer=None): + task_type = task_type if task_type is not None else self._task_type + task_index = task_index if task_index is not None else self._task_index + + if task_type is not None and task_index is not None: + master = self.cluster_spec().task_address(task_type, task_index) + if rpc_layer or self._rpc_layer: + return '%s://%s' % (rpc_layer or self._rpc_layer, master) + else: + return master return '' + + @property + def task_type(self): + return self._task_type + + @property + def task_index(self): + return self._task_index + + @task_type.setter + def task_type(self, task_type): + raise RuntimeError( + 'You cannot reset the task_type of the GceClusterResolver after it has ' + 'been created.') + + @task_index.setter + def task_index(self, task_index): + self._task_index = task_index + + @property + def environment(self): + """Returns the current environment which TensorFlow is running in. 
+ + For users in the GCE environment, the environment property is always an + empty string, and Google users will not use this ClusterResolver for running + on internal systems. + """ + return '' + + @property + def rpc_layer(self): + return self._rpc_layer + + @rpc_layer.setter + def rpc_layer(self, rpc_layer): + self._rpc_layer = rpc_layer + + def num_accelerators_per_worker(self, session_config=None): + del session_config # Unused, since this is set manually in __init__. + return self._num_accelerators_per_worker + diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py index 87b83031224..c691552e860 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py @@ -135,12 +135,86 @@ class GceClusterResolverTest(test.TestCase): """ self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) + def testMasterRetrieval(self): + gce_cluster_resolver = GceClusterResolver( + project='test-project', + zone='us-east1-d', + instance_group='test-instance-group', + task_index=0, + port=8470, + credentials=None, + service=self.standard_mock_service_client()) + self.assertEqual(gce_cluster_resolver.master(), 'grpc://10.123.45.67:8470') + + def testMasterRetrievalWithCustomTasks(self): + name_to_ip = [ + {'name': 'instance1', 'ip': '10.1.2.3'}, + {'name': 'instance2', 'ip': '10.2.3.4'}, + {'name': 'instance3', 'ip': '10.3.4.5'}, + ] + + gce_cluster_resolver = GceClusterResolver( + project='test-project', + zone='us-east1-d', + instance_group='test-instance-group', + port=8470, + credentials=None, + service=self.gen_standard_mock_service_client(name_to_ip)) + + self.assertEqual( + gce_cluster_resolver.master('worker', 2, 'test'), + 'test://10.3.4.5:8470') + + def testOverrideParameters(self): + name_to_ip = [ + {'name': 'instance1', 'ip': '10.1.2.3'}, + {'name': 'instance2', 'ip': '10.2.3.4'}, + {'name': 'instance3', 'ip': '10.3.4.5'}, + ] + + gce_cluster_resolver = GceClusterResolver( + project='test-project', + zone='us-east1-d', + instance_group='test-instance-group', + task_type='testworker', + port=8470, + credentials=None, + service=self.gen_standard_mock_service_client(name_to_ip)) + + gce_cluster_resolver.task_index = 1 + gce_cluster_resolver.rpc_layer = 'test' + + self.assertEqual(gce_cluster_resolver.task_type, 'testworker') + self.assertEqual(gce_cluster_resolver.task_index, 1) + self.assertEqual(gce_cluster_resolver.rpc_layer, 'test') + self.assertEqual(gce_cluster_resolver.master(), 'test://10.2.3.4:8470') + + def testOverrideParametersWithZeroOrEmpty(self): + name_to_ip = [ + {'name': 'instance1', 'ip': '10.1.2.3'}, + {'name': 'instance2', 'ip': '10.2.3.4'}, + {'name': 'instance3', 'ip': '10.3.4.5'}, + ] + + gce_cluster_resolver = GceClusterResolver( + project='test-project', + zone='us-east1-d', + instance_group='test-instance-group', + task_type='', + task_index=1, + port=8470, + credentials=None, + service=self.gen_standard_mock_service_client(name_to_ip)) + + self.assertEqual(gce_cluster_resolver.master( + task_type='', task_index=0), 'grpc://10.1.2.3:8470') + def testCustomJobNameAndPortRetrieval(self): gce_cluster_resolver = GceClusterResolver( project='test-project', zone='us-east1-d', instance_group='test-instance-group', - job_name='custom', + task_type='custom', port=2222, credentials=None, service=self.standard_mock_service_client()) 
@@ -196,7 +270,7 @@ class GceClusterResolverTest(test.TestCase): project='test-project', zone='us-east1-d', instance_group='test-instance-group', - job_name='worker', + task_type='worker', port=8470, credentials=None, service=self.gen_standard_mock_service_client(worker1_name_to_ip)) @@ -205,7 +279,7 @@ class GceClusterResolverTest(test.TestCase): project='test-project', zone='us-east1-d', instance_group='test-instance-group', - job_name='worker', + task_type='worker', port=8470, credentials=None, service=self.gen_standard_mock_service_client(worker2_name_to_ip)) @@ -214,7 +288,7 @@ class GceClusterResolverTest(test.TestCase): project='test-project', zone='us-east1-d', instance_group='test-instance-group', - job_name='ps', + task_type='ps', port=2222, credentials=None, service=self.gen_standard_mock_service_client(ps_name_to_ip)) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index c4ac9d07001..1f6803a9ff9 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -50,6 +50,34 @@ class TPUClusterResolver(ClusterResolver): Cloud Platform project. """ + def _tpuService(self): + """Creates a new Cloud TPU API object. + + This works around an issue where the underlying HTTP connection sometimes + times out when the script has been running for too long. Other methods in + this object calls this method to get a new API object whenever they need + to communicate with the Cloud API. + + Returns: + A Google Cloud TPU API object. + """ + if self._service: + return self._service + + credentials = self._credentials + if credentials is None or credentials == 'default': + credentials = GoogleCredentials.get_application_default() + + if self._discovery_url: + return discovery.build( + 'tpu', 'v1alpha1', + credentials=credentials, + discoveryServiceUrl=self._discovery_url) + else: + return discovery.build( + 'tpu', 'v1alpha1', + credentials=credentials) + def _requestComputeMetadata(self, path): req = Request('http://metadata/computeMetadata/v1/%s' % path, headers={'Metadata-Flavor': 'Google'}) @@ -81,7 +109,7 @@ class TPUClusterResolver(ClusterResolver): return None @staticmethod - def _discoveryUrl(): + def _environmentDiscoveryUrl(): return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE) def __init__(self, @@ -154,49 +182,42 @@ class TPUClusterResolver(ClusterResolver): self._tpu = compat.as_bytes(tpu) # self._tpu is always bytes self._job_name = job_name - self._credentials = credentials + # Whether we should actually attempt to contact Cloud APIs should_resolve = self._shouldResolve() + # We error out if we are in a non-Cloud environment which cannot talk to the + # Cloud APIs using the standard class and a special object is not passed in. + self._service = service + if (self._service is None and should_resolve and + not _GOOGLE_API_CLIENT_INSTALLED): + raise ImportError('googleapiclient and oauth2client must be installed ' + 'before using the TPU cluster resolver. Execute: ' + '`pip install --upgrade google-api-python-client` ' + 'and `pip install --upgrade oauth2client` to ' + 'install with pip.') + + # We save user-passed credentials, unless the user didn't pass in anything. 
+ self._credentials = credentials + if (credentials == 'default' and should_resolve and + _GOOGLE_API_CLIENT_INSTALLED): + self._credentials = None + + # Automatically detect project and zone if unspecified. if not project and should_resolve: project = compat.as_str( self._requestComputeMetadata('project/project-id')) - if not zone and should_resolve: zone_path = compat.as_str(self._requestComputeMetadata('instance/zone')) zone = zone_path.split('/')[-1] - self._project = project self._zone = zone - if credentials == 'default' and should_resolve: - if _GOOGLE_API_CLIENT_INSTALLED: - self._credentials = GoogleCredentials.get_application_default() - - if service is None and should_resolve: - if not _GOOGLE_API_CLIENT_INSTALLED: - raise ImportError('googleapiclient and oauth2client must be installed ' - 'before using the TPU cluster resolver. Execute: ' - '`pip install --upgrade google-api-python-client` ' - 'and `pip install --upgrade oauth2client` to ' - 'install with pip.') - - final_discovery_url = self._discoveryUrl() or discovery_url - if final_discovery_url: - self._service = discovery.build( - 'tpu', 'v1alpha1', - credentials=self._credentials, - discoveryServiceUrl=final_discovery_url) - else: - self._service = discovery.build( - 'tpu', 'v1alpha1', - credentials=self._credentials) - else: - self._service = service + self._discovery_url = self._environmentDiscoveryUrl() or discovery_url self._coordinator_name = coordinator_name - if coordinator_name and not coordinator_address and (should_resolve or - in_gke): + if (coordinator_name and not coordinator_address and + (should_resolve or in_gke)): self._start_local_server() else: self._coordinator_address = coordinator_address @@ -270,7 +291,8 @@ class TPUClusterResolver(ClusterResolver): # Case 1. 
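TPUClusterResolver now defers building the Cloud TPU API client to _tpuService(), which reuses an injected service object (handy for tests), resolves 'default' credentials only when a client is actually needed, and honours a discovery URL override. The sketch below captures that lazy, injectable-client shape with stand-in objects; it does not call the real googleapiclient and the names are assumptions for illustration.

class LazyClientHolder:
    """Illustration of the deferred-client pattern behind _tpuService()."""

    def __init__(self, service=None, credentials='default', discovery_url=None):
        self._service = service            # pre-built client injected by tests
        self._credentials = credentials
        self._discovery_url = discovery_url

    def _default_credentials(self):
        return 'application-default-credentials'       # stand-in for GoogleCredentials

    def _build_client(self, credentials, discovery_url):
        return ('client', credentials, discovery_url)  # stand-in for discovery.build

    def service(self):
        if self._service:
            return self._service                       # reuse whatever was injected
        credentials = self._credentials
        if credentials is None or credentials == 'default':
            credentials = self._default_credentials()
        return self._build_client(credentials, self._discovery_url)


holder = LazyClientHolder(discovery_url='https://{api}.internal/{apiVersion}')
print(holder.service())
injected = LazyClientHolder(service='fake-client-from-test')
print(injected.service())    # 'fake-client-from-test', nothing is built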
full_name = 'projects/%s/locations/%s/nodes/%s' % ( self._project, self._zone, compat.as_text(self._tpu)) - request = self._service.projects().locations().nodes().get(name=full_name) + service = self._tpuService() + request = service.projects().locations().nodes().get(name=full_name) response = request.execute() if 'state' in response and response['state'] != 'READY': diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py index ad4f6432630..478c82967ba 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py @@ -459,10 +459,10 @@ class TPUClusterResolverTest(test.TestCase): del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] - def testDiscoveryUrl(self): + def testEnvironmentDiscoveryUrl(self): os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}' self.assertEqual('https://{api}.internal/{apiVersion}', - TPUClusterResolver._discoveryUrl()) + TPUClusterResolver._environmentDiscoveryUrl()) if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index fbdca497fcc..a63366e1361 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -59,8 +59,6 @@ option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires M # GPU, CUDA and cuDNN options option(tensorflow_ENABLE_GPU "Enable GPU support" OFF) -set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against") -set(tensorflow_CUDNN_VERSION "7" CACHE STRING "cuDNN version to build against") if(HAIKU) option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF) @@ -72,25 +70,25 @@ endif() if (NOT WIN32) # Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option # for targets that link ${CMAKE_THREAD_LIBS_INIT}. - find_package (Threads) + find_package (Threads REQUIRED) # Options for linking CUDA/CUDNN libraries - option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/) + option(tensorflow_PATH_CUDA_LIB "Additional library search path for cudnn, nccl, culibos" /usr/local/cuda/lib64/) option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/) if (NOT tensorflow_CUDNN_INCLUDE) # option's default value is OFF. Fill it with real default values set(tensorflow_CUDNN_INCLUDE /usr/include) endif (NOT tensorflow_CUDNN_INCLUDE) - option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB}) - if (NOT tensorflow_PATH_CUDNN_STATIC_LIB) + option(tensorflow_PATH_CUDNN_LIB "Override PATH_CUDA_LIB for cudnn" ${tensorflow_PATH_CUDA_LIB}) + if (NOT tensorflow_PATH_CUDNN_LIB) # option's default value is OFF. 
Fill it with real default values - set (tensorflow_PATH_CUDNN_STATIC_LIB ${tensorflow_PATH_STATIC_LIB}) - endif (NOT tensorflow_PATH_CUDNN_STATIC_LIB) - option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB}) - if (NOT tensorflow_PATH_NCCL_STATIC_LIB) + set (tensorflow_PATH_CUDNN_LIB ${tensorflow_PATH_CUDA_LIB}) + endif (NOT tensorflow_PATH_CUDNN_LIB) + option(tensorflow_PATH_NCCL_LIB "Override PATH_CUDA_LIB for nccl" ${tensorflow_PATH_CUDA_LIB}) + if (NOT tensorflow_PATH_NCCL_LIB) # option's default value is OFF. Fill it with real default values - set (tensorflow_PATH_NCCL_STATIC_LIB ${tensorflow_PATH_STATIC_LIB}) - endif (NOT tensorflow_PATH_NCCL_STATIC_LIB) + set (tensorflow_PATH_NCCL_LIB ${tensorflow_PATH_CUDA_LIB}) + endif (NOT tensorflow_PATH_NCCL_LIB) option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64) if (NOT tensorflow_CUDA_LIBRARY_PATH) # option's default value is OFF. Fill it with real default values @@ -210,14 +208,17 @@ endif() include(CheckCXXCompilerFlag) # OpenMP Support -CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT) -if (GCC_OPENMP_SUPPORT) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -endif() -CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT) -if (MSVC_OPENMP_SUPPORT) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp") -endif() +if (WIN32) + CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT) + if (MSVC_OPENMP_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp") + endif() +else (WIN32) + CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT) + if (GCC_OPENMP_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + endif() +endif (WIN32) # MSVC SIMD instructions if (tensorflow_WIN_CPU_SIMD_OPTIONS) @@ -377,29 +378,19 @@ if (tensorflow_ENABLE_GPU) list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs") endif (NOT WIN32) - # later command will make use of the value in tensorflow_CUDA_VERSION - find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED EXACT) - - # Test compatibility of compiler on CUDA - try_compile(CUDA_TEST_COMPILE_C - ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda - ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.c - CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}) - try_compile(CUDA_TEST_COMPILE_CXX - ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda - ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.cc - CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}) - if(NOT (CUDA_TEST_COMPILE_C AND CUDA_TEST_COMPILE_CXX)) - message(FATAL_ERROR "Selected compiler (or version) is not supported for CUDA") + # minimum 9.1 in cuda version + find_package(CUDA 9.1 REQUIRED) + if(NOT CUDA_FOUND) + message(FATAL_ERROR "CUDA not found.") endif() - # by default we assume compute cabability 3.5 and 5.2. If you change this change it in - # CUDA_NVCC_FLAGS and cuda_config.h below - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_37,code=\"sm_37,compute_37\") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_52,code=\"sm_52,compute_52\") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_60,code=\"sm_60,compute_60\") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=\"sm_61,compute_61\") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_70,code=\"sm_70,compute_70\") + # use cmake internal CUDA_ARCH_NAME switch + # e.g. 
CUDA_ARCH_NAME="Auto" will autodetect + # CUDA_ARCH_NAME="All" will use all arches + cuda_select_nvcc_arch_flags(NVCC_ARCH_FLAGS ${CUDA_ARCH_NAME}) + list(APPEND CUDA_NVCC_FLAGS ${NVCC_ARCH_FLAGS}) + message(STATUS "Using CUDA arch flags: ${NVCC_ARCH_FLAGS_readable}") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include) @@ -423,43 +414,94 @@ if (tensorflow_ENABLE_GPU) else (WIN32) set(CUDNN_INCLUDE "${tensorflow_CUDNN_INCLUDE}") - find_library(nccl_STATIC_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) - if (NOT nccl_STATIC_LIBRARY) + if (tensorflow_BUILD_SHARED_LIB) + find_library(nccl_LIBRARY NAMES libnccl.so PATHS ${tensorflow_PATH_NCCL_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + else (tensorflow_BUILD_SHARED_LIB) + find_library(nccl_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + endif (tensorflow_BUILD_SHARED_LIB) + if (NOT nccl_LIBRARY) message(FATAL_ERROR "NCCL is required for GPU-build") - else (NOT nccl_STATIC_LIBRARY) - message("nccl-static: ${nccl_STATIC_LIBRARY}") + else (NOT nccl_LIBRARY) + message("nccl: ${nccl_LIBRARY}") # something like /usr/lib64/libnccl_static.a - endif (NOT nccl_STATIC_LIBRARY) + endif (NOT nccl_LIBRARY) - find_library(cudnn_STATIC_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) - if (NOT cudnn_STATIC_LIBRARY) + if (tensorflow_BUILD_SHARED_LIB) + find_library(cudnn_LIBRARY NAMES libcudnn.so PATHS ${tensorflow_PATH_CUDNN_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + else (tensorflow_BUILD_SHARED_LIB) + find_library(cudnn_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + endif (tensorflow_BUILD_SHARED_LIB) + if (NOT cudnn_LIBRARY) message(FATAL_ERROR "CUDNN is required for GPU-build") - else (NOT cudnn_STATIC_LIBRARY) - message("cudnn-static: ${cudnn_STATIC_LIBRARY}") - endif (NOT cudnn_STATIC_LIBRARY) + else (NOT cudnn_LIBRARY) + file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) + # fetch cudnn version + string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" + CUDNN_VERSION_MAJOR "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" + CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}") + string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)" + CUDNN_VERSION_MINOR "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1" + CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}") + string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" + CUDNN_VERSION_PATCH "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" + CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}") + if(NOT CUDNN_VERSION_MAJOR) + set(CUDNN_VERSION "???") + else() + set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}") + endif() + message(STATUS "cudnn library: ${cudnn_LIBRARY} (found version: \"${CUDNN_VERSION}\")") + endif (NOT cudnn_LIBRARY) - find_library(culibos_STATIC_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) - if (NOT culibos_STATIC_LIBRARY) + if (tensorflow_BUILD_SHARED_LIB) + # shared first (if exists) else static one + find_library(culibos_LIBRARY NAMES libculibos.so libculibos.a PATHS 
${tensorflow_PATH_CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + else (tensorflow_BUILD_SHARED_LIB) + # only static version + find_library(culibos_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + endif (tensorflow_BUILD_SHARED_LIB) + if (NOT culibos_LIBRARY) message(FATAL_ERROR "CULIBOS is required for GPU-build") - else (NOT culibos_STATIC_LIBRARY) - message("culibos-static: ${culibos_STATIC_LIBRARY}") - endif (NOT culibos_STATIC_LIBRARY) + else (NOT culibos_LIBRARY) + message("culibos: ${culibos_LIBRARY}") + endif (NOT culibos_LIBRARY) set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} - ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY}) + ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_LIBRARY} ${culibos_LIBRARY} ${nccl_LIBRARY}) endif (WIN32) include_directories(${CUDNN_INCLUDE}) # Remove "." from CUDA version variable. - string(REPLACE "." "" short_CUDA_VER ${tensorflow_CUDA_VERSION}) + string(REPLACE "." "" short_CUDA_VER ${CUDA_VERSION}) + + # List of enumerated CUDA caps + string(REPLACE " " ";" NVCC_ARCH_LIST "${NVCC_ARCH_FLAGS_readable}") + set(list ${NVCC_ARCH_LIST}) + + # Construct capability string + foreach(NVCC_ARCH ${NVCC_ARCH_LIST}) + if (NVCC_ARCH MATCHES "sm_") + string(REGEX REPLACE "^.sm*" "" NVCC_ARCH ${NVCC_ARCH}) + math(EXPR NVCC_ARCH_MAJOR "${NVCC_ARCH} / 10") + math(EXPR NVCC_ARCH_MINOR "(${NVCC_ARCH} - (${NVCC_ARCH_MAJOR}*10))") + if (TF_CUDA_CAP) + set(TF_CUDA_CAP "${TF_CUDA_CAP},CudaVersion(\"${NVCC_ARCH_MAJOR}.${NVCC_ARCH_MINOR}\")") + else (TF_CUDA_CAP) + set(TF_CUDA_CAP "CudaVersion(\"${NVCC_ARCH_MAJOR}.${NVCC_ARCH_MINOR}\")") + endif (TF_CUDA_CAP) + endif() + endforeach() # create cuda_config.h FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h "#ifndef CUDA_CUDA_CONFIG_H_\n" "#define CUDA_CUDA_CONFIG_H_\n" - "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.7\"),CudaVersion(\"5.2\"),CudaVersion(\"6.0\"),CudaVersion(\"6.1\"),CudaVersion(\"7.0\")\n" + "#define TF_CUDA_CAPABILITIES ${TF_CUDA_CAP}\n" "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n" - "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n" + "#define TF_CUDNN_VERSION \"64_${CUDNN_VERSION}\"\n" "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n" "#endif // CUDA_CUDA_CONFIG_H_\n" ) @@ -494,14 +536,14 @@ if (tensorflow_ENABLE_GPU) set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value msvcp_dll_name=msvcp140.dll cudart_dll_name=cudart64_${short_CUDA_VER}.dll - cuda_version_number=${tensorflow_CUDA_VERSION} + cuda_version_number=${CUDA_VERSION} nvcuda_dll_name=nvcuda.dll cudnn_dll_name=cudnn64_${tensorflow_CUDNN_VERSION}.dll cudnn_version_number=${tensorflow_CUDNN_VERSION}) else(WIN32) set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value - cuda_version_number=${tensorflow_CUDA_VERSION} - cudnn_version_number=${tensorflow_CUDNN_VERSION}) + cuda_version_number=${CUDA_VERSION} + cudnn_version_number=${tensorflow_CUDNN_VERSION}) endif(WIN32) else(tensorflow_ENABLE_GPU) set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake index c6c5021f60b..4546dbdecc0 100644 --- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake +++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake @@ -20,6 +20,7 @@ if (systemlib_ABSEIL_CPP) 
absl_dynamic_annotations absl_malloc_internal absl_throw_delegate + absl_int128 absl_strings str_format_internal absl_bad_optional_access) @@ -50,6 +51,7 @@ else (systemlib_ABSEIL_CPP) ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib ${abseil_cpp_BUILD}/absl/base/Release/absl_malloc_internal.lib ${abseil_cpp_BUILD}/absl/base/Release/absl_throw_delegate.lib + ${abseil_cpp_BUILD}/absl/numeric/Release/absl_int128.lib ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib) @@ -60,6 +62,7 @@ else (systemlib_ABSEIL_CPP) ${abseil_cpp_BUILD}/absl/base/absl_dynamic_annotations.lib ${abseil_cpp_BUILD}/absl/base/absl_malloc_internal.lib ${abseil_cpp_BUILD}/absl/base/absl_throw_delegate.lib + ${abseil_cpp_BUILD}/absl/numeric/absl_int128.lib ${abseil_cpp_BUILD}/absl/strings/absl_strings.lib ${abseil_cpp_BUILD}/absl/strings/str_format_internal.lib ${abseil_cpp_BUILD}/absl/types/absl_bad_optional_access.lib) @@ -71,6 +74,7 @@ else (systemlib_ABSEIL_CPP) ${abseil_cpp_BUILD}/absl/base/libabsl_dynamic_annotations.a ${abseil_cpp_BUILD}/absl/base/libabsl_malloc_internal.a ${abseil_cpp_BUILD}/absl/base/libabsl_throw_delegate.a + ${abseil_cpp_BUILD}/absl/numeric/libabsl_int128.a ${abseil_cpp_BUILD}/absl/strings/libabsl_strings.a ${abseil_cpp_BUILD}/absl/strings/libstr_format_internal.a ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a) diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index d94b703700c..96160568fa7 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -57,6 +57,7 @@ tensorflow/python/ops tensorflow/python/ops/distributions tensorflow/python/ops/linalg tensorflow/python/ops/losses +tensorflow/python/ops/signal tensorflow/python/platform tensorflow/python/profiler tensorflow/python/profiler/internal @@ -377,8 +378,6 @@ tensorflow/contrib/seq2seq/python/ops tensorflow/contrib/session_bundle tensorflow/contrib/session_bundle/example tensorflow/contrib/signal -tensorflow/contrib/signal/python -tensorflow/contrib/signal/python/ops tensorflow/contrib/slim tensorflow/contrib/slim/python tensorflow/contrib/slim/python/slim diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index 88b4a6165c0..d66e39ac07c 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -68,14 +68,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS) "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/csv_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/unique_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc" 
"${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc" diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake index ef337b3a15c..9cfa8b90749 100644 --- a/tensorflow/contrib/cmake/tf_core_ops.cmake +++ b/tensorflow/contrib/cmake/tf_core_ops.cmake @@ -89,7 +89,6 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/t GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(coder "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc") -GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(framework_variable "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc") diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index ef487d3509b..df7b854afcc 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -373,8 +373,6 @@ GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py) GENERATE_PYTHON_OP_LIB("contrib_coder_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/coder/python/ops/gen_coder_ops.py) -GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops" - DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py) GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/factorization/python/ops/gen_clustering_ops.py) GENERATE_PYTHON_OP_LIB("contrib_factorization_factorization_ops" diff --git a/tensorflow/contrib/compiler/xla_test.py b/tensorflow/contrib/compiler/xla_test.py index 8d13dc7316a..3b49755afcf 100644 --- a/tensorflow/contrib/compiler/xla_test.py +++ b/tensorflow/contrib/compiler/xla_test.py @@ -28,7 +28,6 @@ from tensorflow.python.ops import control_flow_util from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops -from tensorflow.python.ops import summary_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import test @@ -49,7 +48,7 @@ class XLACompileContextTest(test.TestCase): histogram_summary = summary.histogram('histogram_summary', dummy_tensor) image_summary = summary.image('image_summary', dummy_tensor) scalar_summary = summary.scalar('scalar_summary', dummy_tensor) - tensor_summary = summary_ops.tensor_summary('tensor_summary', dummy_tensor) + tensor_summary = summary.tensor_summary('tensor_summary', dummy_tensor) summary.merge( [ audio_summary, histogram_summary, image_summary, scalar_summary, 
diff --git a/tensorflow/contrib/copy_graph/python/__init__.py b/tensorflow/contrib/copy_graph/python/__init__.py index b9ff28eb0d7..5c1048e02a3 100644 --- a/tensorflow/contrib/copy_graph/python/__init__.py +++ b/tensorflow/contrib/copy_graph/python/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/contrib/copy_graph/python/util/__init__.py b/tensorflow/contrib/copy_graph/python/util/__init__.py index b9ff28eb0d7..5c1048e02a3 100644 --- a/tensorflow/contrib/copy_graph/python/util/__init__.py +++ b/tensorflow/contrib/copy_graph/python/util/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD index 57ffaa87e45..670b5494327 100644 --- a/tensorflow/contrib/cudnn_rnn/BUILD +++ b/tensorflow/contrib/cudnn_rnn/BUILD @@ -63,8 +63,8 @@ cuda_py_test( ], shard_count = 6, tags = [ - "no_oss", # b/117989214 "noasan", # http://b/62067814 + "requires-gpu-sm35", ], ) diff --git a/tensorflow/contrib/cudnn_rnn/__init__.py b/tensorflow/contrib/cudnn_rnn/__init__.py index 5d8c6191f8d..53202322686 100644 --- a/tensorflow/contrib/cudnn_rnn/__init__.py +++ b/tensorflow/contrib/cudnn_rnn/__init__.py @@ -24,6 +24,10 @@ @@CudnnGRUSaveable @@CudnnRNNReluSaveable @@CudnnRNNTanhSaveable +@@CudnnParamsFormatConverterLSTM +@@CudnnParamsFormatConverterGRU +@@CudnnParamsFormatConverterTanh +@@CudnnParamsFormatConverterRelu """ from __future__ import absolute_import @@ -48,6 +52,10 @@ _allowed_symbols = [ "CudnnGRUSaveable", "CudnnRNNReluSaveable", "CudnnRNNTanhSaveable", + "CudnnParamsFormatConverterLSTM", + "CudnnParamsFormatConverterGRU", + "CudnnParamsFormatConverterTanh", + "CudnnParamsFormatConverterRelu", ] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py index c59d3682d40..ae839108ebe 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py @@ -202,12 +202,13 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase): dtype=dtype) random_seed.set_random_seed(1234) params_size_t = model.params_size() - params = variables.Variable( + params = variables.VariableV1( random_ops.random_uniform([params_size_t], dtype=dtype), dtype=dtype, validate_shape=False) saveable = _CreateParamsSavable(params, model) - weights, biases = saveable._OpaqueParamsToCanonical() + weights, biases = saveable.format_converter._opaque_to_cu_canonical( + saveable._variables) reset_params = state_ops.assign( params, array_ops.zeros([params_size_t], dtype=dtype), @@ -248,7 +249,7 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase): params_size_t = model.params_size() names = ["rnn_1", "rnn_2"] param_vars = [ - variables.Variable( + variables.VariableV1( random_ops.random_uniform([params_size_t], dtype=dtype), dtype=dtype, validate_shape=False) for name in names @@ -256,8 +257,10 @@ class 
CudnnRNNTestSaveRestore(TensorFlowTestCase): saveables = [] for name, params in zip(names, param_vars): saveables.append(_CreateParamsSavable(params, model, name, name)) - weights1, biases1 = saveables[0]._OpaqueParamsToCanonical() - weights2, biases2 = saveables[1]._OpaqueParamsToCanonical() + weights1, biases1 = saveables[0].format_converter._opaque_to_cu_canonical( + saveables[0]._variables) + weights2, biases2 = saveables[1].format_converter._opaque_to_cu_canonical( + saveables[1]._variables) reset_params = [ state_ops.assign( params, @@ -304,7 +307,7 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase): direction=direction, dtype=dtype) params_size_t = model.params_size() - params = variables.Variable( + params = variables.VariableV1( array_ops.ones([params_size_t], dtype=dtype), validate_shape=False, dtype=dtype) @@ -422,21 +425,21 @@ class CudnnRNNTestParamsSize(TensorFlowTestCase): cudnn_rnn_ops.CUDNN_LSTM, constant_op.constant([4]), 200, 200, direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) - params_size = model.params_size() + _ = model.params_size() with self.assertRaisesRegexp( ValueError, "Shape must be rank 0 but is rank 1"): model = _CreateModel( cudnn_rnn_ops.CUDNN_LSTM, 4, constant_op.constant([200]), 200, direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) - params_size = model.params_size() + _ = model.params_size() with self.assertRaisesRegexp( ValueError, "Shape must be rank 0 but is rank 1"): model = _CreateModel( cudnn_rnn_ops.CUDNN_LSTM, 4, 200, constant_op.constant([200]), direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) - params_size = model.params_size() + _ = model.params_size() class CudnnRNNTestInference(TensorFlowTestCase): @@ -458,7 +461,7 @@ class CudnnRNNTestInference(TensorFlowTestCase): params_size_t = model.params_size() input_data = array_ops.ones([seq_length, batch_size, input_size]) input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units]) - params = variables.Variable( + params = variables.VariableV1( array_ops.ones([params_size_t]), validate_shape=False) if has_input_c: input_c = array_ops.ones([num_layers * dir_count, batch_size, num_units]) @@ -584,20 +587,20 @@ class CudnnRNNTestTraining(TensorFlowTestCase): dtype=dtype, dropout=dropout) params_size_t = model.params_size() - input_data = variables.Variable( + input_data = variables.VariableV1( random_ops.random_uniform( [seq_length, batch_size, input_size], dtype=dtype), dtype=dtype) - input_h = variables.Variable( + input_h = variables.VariableV1( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units], dtype=dtype), dtype=dtype) - params = variables.Variable( + params = variables.VariableV1( random_ops.random_uniform([params_size_t], dtype=dtype), validate_shape=False, dtype=dtype) if has_input_c: - input_c = variables.Variable( + input_c = variables.VariableV1( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units], dtype=dtype), dtype=dtype) @@ -639,7 +642,8 @@ class CudnnRNNTestTraining(TensorFlowTestCase): @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") - def testSimpleTraining(self): + def DISABLED_testSimpleTraining(self): + # TODO(jamesqin): fix b/117989214 test_configs = [ { "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM, diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py b/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py index f09466b631f..60229af374b 100644 --- a/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py +++ b/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py 
@@ -27,5 +27,10 @@ from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibl from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibleLSTMCell from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnGRUSaveable from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTMSaveable +from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterGRU +from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterLSTM +from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterRelu +from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterTanh from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNReluSaveable from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNTanhSaveable + diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py index a324c6e7d76..8bbcc7cd039 100644 --- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py +++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py @@ -388,11 +388,11 @@ class _CudnnRNN(base_layer.Layer): output_states: a tuple of tensor(s) of the same shape and structure as `initial_state`. Raises: - ValueError: initial_state is not a tuple. + TypeError: initial_state is not a tuple. """ if initial_state is not None and not isinstance(initial_state, tuple): - raise ValueError("Invalid initial_state type: %s, expecting tuple.", - type(initial_state)) + raise TypeError("Invalid initial_state type: %s, expecting tuple." % + initial_state) dtype = self.dtype inputs = ops.convert_to_tensor(inputs, dtype=dtype) diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index 2c92f317883..d06d0c6bdaa 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -74,7 +74,7 @@ class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell): class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell): - """Cudnn Compatible GRUCell. + r"""Cudnn Compatible GRUCell. A GRU impl akin to `tf.nn.rnn_cell.GRUCell` to use along with `tf.contrib.cudnn_rnn.CudnnGRU`. The latter's params can be used by @@ -177,172 +177,60 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell): return new_h, new_h -# TODO(yaozhang): make sure we only save the canonical version of params and -# don't save the platform-specific version to avoid potential race -# conditions where params is updated by both versions when being restored. -# Currently, checkpointing will function properly, despite that we save both -# versions, because Saver restores customized savables after Variables. -# However, it is good to not rely on this restoring order of Saver and to -# avoid unnecessary storage. Add a test to check only the canonical version is -# saved. -class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): - """Abstract SaveableObject implementation handling Cudnn opaque params.""" +class CudnnParamsFormatConverter(object): + """Abstract class that converts between params of Cudnn Rnn and TF Rnn.""" def __init__(self, - opaque_params, num_layers, num_units, input_size, input_mode=CUDNN_INPUT_LINEAR_MODE, - direction=CUDNN_RNN_UNIDIRECTION, - scope=None, - name="cudnn_rnn_saveable"): - """Creates a CudnnOpaqueParamsSaveable object. 
- - CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file - and is used to save/restore the weights and biases parameters in a - canonical format which is directly consumable by platform-independent tf - RNN cells. Parameters are saved as tensors layer by layer with weight - tensors followed by bias tensors, and forward direction followed by - backward direction (if applicable). When restoring, a user could name - param_variables as desired, and restore weight and bias tensors to these - variables. - - For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per - bias for each layer: tensor 0 is applied to the input from the previous - layer and tensor 1 to the recurrent input. - - For CudnnLSTM, there are 8 tensors per weight and per bias for each - layer: tensor 0-3 are applied to the input from the previous layer and - tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate; - tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate; - tensor 3 and 7 the output gate. - - For CudnnGRU, there are 6 tensors per weight and per bias for each layer: - tensor 0-2 are applied to the input from the previous layer and - tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate; - tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate. + direction=CUDNN_RNN_UNIDIRECTION): + """Constructor. Args: - opaque_params: a variable, Cudnn RNN opaque params. num_layers: the number of layers for the RNN model. num_units: the number of units within the RNN model. input_size: the size of the input, it could be different from the - num_units. + num_units. input_mode: indicate whether there is a linear projection between the - input and the actual computation before the first layer. It could be - 'linear_input', 'skip_input' or 'auto_select'. - 'linear_input' (default) always applies a linear projection of input - onto RNN hidden state. (standard RNN behavior). - 'skip_input' is only allowed when input_size == num_units; - 'auto_select' implies 'skip_input' when input_size == num_units; - otherwise, it implies 'linear_input'. + input and the actual computation before the first layer. It could be one + of 'linear_input', 'skip_input' or 'auto_select'. * 'linear_input' + (default) always applies a linear projection of input onto RNN hidden + state. (standard RNN behavior). * 'skip_input' is only allowed when + input_size == num_units; * 'auto_select' implies 'skip_input' when + input_size == num_units; otherwise, it implies 'linear_input'. direction: the direction model that the model operates. Could be either - 'unidirectional' or 'bidirectional' - scope: string of VariableScope, the scope of equivalent subgraph - consisting only platform-independent tf RNN cells. - name: the name of the CudnnOpaqueParamsSaveable object. + 'unidirectional' or 'bidirectional' """ - # Define in subclasses. 
self._num_layers = num_layers self._input_size = input_size self._num_units = num_units self._input_mode = input_mode self._direction = direction - if scope is not None: - scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope - self._scope = scope_name or None - else: - self._scope = None - - self._variables = opaque_params self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2 self._num_params = ( self._num_params_per_layer * self._num_layers * self._num_dirs) - weights, biases = self._OpaqueParamsToCanonical() - (weights, weight_names), (biases, bias_names) = self._TransformCanonical( - weights, biases) - # We currently don't use slice_spec. It might be useful in a distributed - # setting where each parameter server node stores a slice of variable, - # instead of having the master pull all slices and then save them. - slice_spec = "" - params = weights + biases - self._weight_names = weight_names - self._bias_names = bias_names - self._param_names = weight_names + bias_names - prefixed_param_names = weight_names + bias_names - if self._scope: - prefixed_param_names = [ - "%s/%s" % (self._scope, pn) for pn in prefixed_param_names] - specs = [ - saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name) - for param, param_name in zip(params, prefixed_param_names) - ] - super(CudnnOpaqueParamsSaveable, self).__init__( - array_ops.identity(self._variables), specs, name) + def tf_canonical_to_opaque(self, tf_canonicals): + r"""Converts tf canonical weights to cudnn opaque param.""" + cu_weights, cu_biases = self._tf_canonical_to_cu_canonical(tf_canonicals) + cu_weights = [array_ops.reshape(w, [-1]) for w in cu_weights] + opaque_params = self._cu_canonical_to_opaque(cu_weights, cu_biases) + return opaque_params - def restore(self, restored_tensors, restored_shapes): - weights, biases = self._ReverseTransformCanonical(restored_tensors) - weights = [array_ops.reshape(w, [-1]) for w in weights] - opaque_params = self._CanonicalToOpaqueParams(weights, biases) + def opaque_to_tf_canonical(self, opaque_param): + r"""Converts cudnn opaque param to tf canonical weights.""" + cu_weights, cu_biases = self._opaque_to_cu_canonical(opaque_param) + weights, biases = self._cu_canonical_to_tf_canonical(cu_weights, cu_biases) + return weights, biases - return state_ops.assign( - self._variables, opaque_params, validate_shape=False) - - def _checkpointable_save(self, save_buffer): - weights, biases = self._OpaqueParamsToCanonical() - with ops.device("gpu:0"): - (weights, _), (biases, _) = self._TransformCanonical( - weights, biases) - for name, tensor in zip(self._param_names, weights + biases): - save_buffer[name] = array_ops.identity(tensor) - - def _checkpointable_restore(self, restore_buffer): - tensors = [array_ops.identity(restore_buffer[name]) - for name in self._param_names] - return self.restore( - restored_tensors=tensors, - restored_shapes=None # Unused - ) - - def _add_checkpointable_dependencies(self, checkpointable, dtype): - """Add canonical weight dependencies to `checkpointable`. - - When saving or restoring, converts to or from the opaque buffer - format. Weights are saved and loaded in the configuration expected by - cuDNN-compatible cells. - - Args: - checkpointable: An object inheriting from `CheckpointableBase` to add - dependencies too (typically the cuDNN `Layer`). - dtype: The dtype for the canonical parameter Tensors. 
- """ - split_dependencies = split_dependency.split_dependency( - component_names=self._param_names, - component_dtypes=(dtype,) * len(self._param_names), - fill_save_buffer_fn=self._checkpointable_save, - consume_restore_buffer_fn=self._checkpointable_restore) - self._checkpointable_track_params(checkpointable, split_dependencies) - - def _checkpointable_track_params(self, checkpointable, params): - """Tracks parameters in a canonical configuration.""" - return # NotImplementedError raised by the Layer. - - def _TFCanonicalNamePrefix(self, layer, is_fwd=True): - if self._direction == CUDNN_RNN_UNIDIRECTION: - return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name) - else: - if is_fwd: - return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" % - (layer, self._rnn_cell_name)) - else: - return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" % - (layer, self._rnn_cell_name)) - - def _OpaqueParamsToCanonical(self): + def _opaque_to_cu_canonical(self, opaque_param): """Converts opaque params to Cudnn canonical format. + Args: + opaque_param: An opaque tensor storing cudnn rnn params (weights and + biases). Returns: 2 list for weights and biases respectively. """ @@ -351,14 +239,14 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): num_layers=self._num_layers, num_units=self._num_units, input_size=self._input_size, - params=self._variables, + params=opaque_param, num_params=self._num_params, rnn_mode=self._rnn_mode, input_mode=self._input_mode, direction=self._direction) return (weights, biases) - def _CanonicalToOpaqueParams(self, cu_weights, cu_biases): + def _cu_canonical_to_opaque(self, cu_weights, cu_biases): """Converts from Cudnn canonical format to opaque params. Args: @@ -378,7 +266,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): input_mode=self._input_mode, direction=self._direction) - def _TransformCanonical(self, cu_weights, cu_biases): + def _cu_canonical_to_tf_canonical(self, cu_weights, cu_biases): r"""Transform from Cudnn canonical to tf canonical. The elements of argument lists are laid out in the following format: @@ -398,46 +286,43 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): cu_weights: a list of tensors of Cudnn canonical weights. cu_biases: a list of tensors of Cudnn canonical biases. Returns: - 2 tuples, one for weights and the other for bias. - Each tuple has two lists: the 1st for transformed tf canonical tensors - and the 2nd for the names of the tensors under which they are saved. + 1 tuple, tf canonical weights and biases. 
""" tf_weights, tf_biases = [], [] - tf_weights_names, tf_bias_names = [], [] layer_weights_num = self._num_params_per_layer * self._num_dirs layer_biases_num = layer_weights_num for i in range(self._num_layers): - layer_weights = cu_weights[i * layer_weights_num: - (i + 1) * layer_weights_num] + layer_weights = cu_weights[i * layer_weights_num:(i + 1) * + layer_weights_num] layer_biases = cu_biases[i * layer_biases_num:(i + 1) * layer_biases_num] if self._direction == CUDNN_RNN_UNIDIRECTION: - prefix = self._TFCanonicalNamePrefix(i) - self._TransformSingleLayerCanonical(layer_weights, layer_biases, prefix, - tf_weights, tf_weights_names, - tf_biases, tf_bias_names) + self._cu_canonical_to_tf_canonical_single_layer( + layer_weights, layer_biases, tf_weights, tf_biases) else: - fw_prefix = self._TFCanonicalNamePrefix(i, is_fwd=True) - bw_prefix = self._TFCanonicalNamePrefix(i, is_fwd=False) - fw_weights = layer_weights[:len(layer_weights) // 2] bw_weights = layer_weights[len(layer_weights) // 2:] fw_biases = layer_biases[:len(layer_biases) // 2] bw_biases = layer_biases[len(layer_biases) // 2:] - self._TransformSingleLayerCanonical(fw_weights, fw_biases, fw_prefix, - tf_weights, tf_weights_names, - tf_biases, tf_bias_names) + self._cu_canonical_to_tf_canonical_single_layer( + fw_weights, + fw_biases, + tf_weights, + tf_biases, + ) - self._TransformSingleLayerCanonical(bw_weights, bw_biases, bw_prefix, - tf_weights, tf_weights_names, - tf_biases, tf_bias_names) - return (tf_weights, tf_weights_names), (tf_biases, tf_bias_names) + self._cu_canonical_to_tf_canonical_single_layer( + bw_weights, + bw_biases, + tf_weights, + tf_biases, + ) + return (tf_weights, tf_biases) - def _TransformSingleLayerCanonical(self, cu_weights, cu_biases, prefix, - tf_weights, tf_weights_names, tf_biases, - tf_bias_names): + def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, + tf_weights, tf_biases): r"""Transform single layer Cudnn canonicals to tf canonicals. The elements of cu_weights, cu_biases are laid out in the following format: @@ -447,15 +332,12 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): Args: cu_weights: a list of tensors, single layer weights. cu_biases: a list of tensors, single layer biases. - prefix: the shared prefix of all tensor names. tf_weights: a list where transformed weights are stored. - tf_weights_names: a list where names of transformed weights are stored. tf_biases: a list where transformed biases are stored. - tf_bias_names: a list where names of transformed biases are stored. """ raise NotImplementedError("Abstract method") - def _ReverseTransformCanonical(self, tf_canonicals): + def _tf_canonical_to_cu_canonical(self, tf_canonicals): r"""Transform from tf canonical to Cudnn canonical. This is the reverse routine of _TransformCanonical(). 
@@ -502,30 +384,27 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): return cu_weights, cu_biases def _cudnn_to_tf_weights(self, *cu_weights): - r"""Stitching cudnn canonical weights to generate tf canonical weights.""" + r"""Stitches cudnn canonical weights to generate tf canonical weights.""" raise NotImplementedError("Abstract method") def _tf_to_cudnn_weights(self, layer, *tf_weights): - r"""Reverse the operations in StitchWeights().""" + r"""Reverses the operations in StitchWeights().""" raise NotImplementedError("Abstract method") def _cudnn_to_tf_biases(self, *biases): - r"""Stitching cudnn canonical biases to generate tf canonical biases.""" + r"""Stitches cudnn canonical biases to generate tf canonical biases.""" raise NotImplementedError("Abstract method") def _tf_to_cudnn_biases(self, *tf_biases): - r"""Reverse the operations in StitchBiases().""" + r"""Reverses the operations in StitchBiases().""" raise NotImplementedError("Abstract method") -class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable): - """SaveableObject implementation handling Cudnn LSTM opaque params.""" - +class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter): + """Helper class that converts between params of Cudnn and TF LSTM.""" _rnn_mode = CUDNN_LSTM _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER - _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__) - def _cudnn_to_tf_gate_params(self, *cu_gate_order): i_g, f_g, c_g, o_g = cu_gate_order return [i_g, c_g, f_g, o_g] @@ -603,44 +482,16 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable): # Return ifco order for Cudnn LSTM. return b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro - def _TransformSingleLayerCanonical(self, weights, biases, prefix, tf_weights, - tf_weights_names, tf_biases, - tf_bias_names): - (w,) = self._cudnn_to_tf_weights(*weights) - (b,) = self._cudnn_to_tf_biases(*biases) - + def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, + tf_weights, tf_biases): + (w,) = self._cudnn_to_tf_weights(*cu_weights) + (b,) = self._cudnn_to_tf_biases(*cu_biases) tf_weights.append(w) - tf_weights_names.append(prefix + "/kernel") - tf_biases.append(b) - tf_bias_names.append(prefix + "/bias") - - def _checkpointable_track_params(self, checkpointable, params): - """Track parameters for compatibility with CudnnCompatibleLSTMCell.""" - biases = [] - weights = [] - for name in self._weight_names: - weights.append(params[name]) - for name in self._bias_names: - biases.append(params[name]) - assert len(params) == len(weights) + len(biases) - if len(weights) == 1 and len(biases) == 1: - # For single-layer cells, allow substituting a cell with no MultiRNNCell - # wrapping. 
- kernel, = weights # pylint: disable=unbalanced-tuple-unpacking - bias, = biases # pylint: disable=unbalanced-tuple-unpacking - checkpointable._track_checkpointable(kernel, name="kernel") # pylint: disable=protected-access - checkpointable._track_checkpointable(bias, name="bias") # pylint: disable=protected-access - assert len(biases) == len(weights) - for cell_index, (bias, kernel) in enumerate(zip(biases, weights)): - cell = checkpointable_lib.Checkpointable() - checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index) # pylint: disable=protected-access - cell.bias = bias - cell.kernel = kernel -class CudnnGRUSaveable(CudnnOpaqueParamsSaveable): - """SaveableObject implementation handling Cudnn GRU opaque params.""" +class CudnnParamsFormatConverterGRU(CudnnParamsFormatConverter): + """Helper class that converts between params of Cudnn and TF GRU.""" _rnn_mode = CUDNN_GRU _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER @@ -702,29 +553,18 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable): b_ri, b_rr = array_ops.split(br, 2, axis=0) return b_wi, b_wr, b_wh, b_ri, b_rr, b_rh - def _TransformSingleLayerCanonical(self, weights, biases, prefix, tf_weights, - tf_weights_names, tf_biases, - tf_bias_names): + def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, + tf_weights, tf_biases): # pylint: disable=invalid-name - W_ir, w_h, r_h = self._cudnn_to_tf_weights(*weights) - b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*biases) + W_ir, w_h, r_h = self._cudnn_to_tf_weights(*cu_weights) + b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*cu_biases) # pylint: enable=invalid-name - tf_weights.extend([W_ir, w_h, r_h]) - tf_weights_names.append(prefix + "/gates/kernel") - tf_weights_names.append(prefix + "/candidate/input_projection/kernel") - tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel") - tf_biases.extend([b_ir, b_wh, b_rh]) - tf_bias_names.append(prefix + "/gates/bias") - tf_bias_names.append(prefix + "/candidate/input_projection/bias") - tf_bias_names.append(prefix + "/candidate/hidden_projection/bias") -class CudnnRNNSimpleSaveable(CudnnLSTMSaveable): - """SaveableObject implementation handling Cudnn RNN Tanh opaque params.""" - - _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__) +class CudnnParamsFormatConverterBasic(CudnnParamsFormatConverterLSTM): + """Helper class that converts between params of Cudnn and TF Relu/Tanh RNN.""" def _cudnn_to_tf_weights(self, *cu_weights): r"""Stitching cudnn canonical weights to generate tf canonical weights.""" @@ -766,18 +606,270 @@ class CudnnRNNSimpleSaveable(CudnnLSTMSaveable): return b_i, b_h -class CudnnRNNTanhSaveable(CudnnRNNSimpleSaveable): - """SaveableObject implementation handling Cudnn RNN Tanh opaque params.""" +class CudnnParamsFormatConverterTanh(CudnnParamsFormatConverterBasic): + """Helper class that converts between params of Cudnn and TF Tanh RNN.""" _rnn_mode = CUDNN_RNN_TANH _num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER -class CudnnRNNReluSaveable(CudnnRNNSimpleSaveable): - """SaveableObject implementation handling Cudnn RNN Relu opaque params.""" +class CudnnParamsFormatConverterRelu(CudnnParamsFormatConverterBasic): + """Helper class that converts between params of Cudnn and TF Relu RNN.""" _rnn_mode = CUDNN_RNN_RELU _num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER +# TODO(yaozhang): make sure we only save the canonical version of params and +# don't save the platform-specific version to avoid potential race +# conditions where 
params is updated by both versions when being restored. +# Currently, checkpointing will function properly, despite that we save both +# versions, because Saver restores customized savables after Variables. +# However, it is good to not rely on this restoring order of Saver and to +# avoid unnecessary storage. Add a test to check only the canonical version is +# saved. +class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): + """Abstract SaveableObject implementation handling Cudnn opaque params.""" + + def __init__(self, + opaque_params, + num_layers, + num_units, + input_size, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + scope=None, + name="cudnn_rnn_saveable"): + """Creates a CudnnOpaqueParamsSaveable object. + + CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file + and is used to save/restore the weights and biases parameters in a + canonical format which is directly consumable by platform-independent tf + RNN cells. Parameters are saved as tensors layer by layer with weight + tensors followed by bias tensors, and forward direction followed by + backward direction (if applicable). When restoring, a user could name + param_variables as desired, and restore weight and bias tensors to these + variables. + + For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per + bias for each layer: tensor 0 is applied to the input from the previous + layer and tensor 1 to the recurrent input. + + For CudnnLSTM, there are 8 tensors per weight and per bias for each + layer: tensor 0-3 are applied to the input from the previous layer and + tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate; + tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate; + tensor 3 and 7 the output gate. + + For CudnnGRU, there are 6 tensors per weight and per bias for each layer: + tensor 0-2 are applied to the input from the previous layer and + tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate; + tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate. + + Args: + opaque_params: a variable, Cudnn RNN opaque params. + num_layers: the number of layers for the RNN model. + num_units: the number of units within the RNN model. + input_size: the size of the input, it could be different from the + num_units. + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default) + always applies a linear projection of input onto RNN hidden state. + (standard RNN behavior). 'skip_input' is only allowed when input_size == + num_units; 'auto_select' implies 'skip_input' when input_size == + num_units; otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + scope: string of VariableScope, the scope of equivalent subgraph + consisting only platform-independent tf RNN cells. + name: the name of the CudnnOpaqueParamsSaveable object. + """ + # Define in subclasses. 
+ self._num_layers = num_layers + self._input_size = input_size + self._num_units = num_units + self._input_mode = input_mode + self._direction = direction + if scope is not None: + scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope + self._scope = scope_name or None + else: + self._scope = None + + self._variables = opaque_params + self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2 + # Defined in subclasses. + self._format_converter = None + + tf_weights, tf_biases = ( + self.format_converter.opaque_to_tf_canonical(self._variables)) + tf_weight_names, tf_bias_names = self._tf_canonical_names() + # We currently don't use slice_spec. It might be useful in a distributed + # setting where each parameter server node stores a slice of variable, + # instead of having the master pull all slices and then save them. + slice_spec = "" + params = tf_weights + tf_biases + self._weight_names = tf_weight_names + self._bias_names = tf_bias_names + self._param_names = tf_weight_names + tf_bias_names + prefixed_param_names = tf_weight_names + tf_bias_names + if self._scope: + prefixed_param_names = [ + "%s/%s" % (self._scope, pn) for pn in prefixed_param_names + ] + specs = [ + saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name) + for param, param_name in zip(params, prefixed_param_names) + ] + super(CudnnOpaqueParamsSaveable, self).__init__( + array_ops.identity(self._variables), specs, name) + + @property + def format_converter(self): + if self._format_converter is None: + self._format_converter = self._format_converter_cls( + self._num_layers, self._num_units, self._input_size, self._input_mode, + self._direction) + return self._format_converter + + def restore(self, restored_tensors, restored_shapes): + opaque_params = self.format_converter.tf_canonical_to_opaque( + restored_tensors) + return state_ops.assign( + self._variables, opaque_params, validate_shape=False) + + def _checkpointable_save(self, save_buffer): + weights, biases = self.format_converter.opaque_params_to_tf_canonical( + self._variables) + for name, tensor in zip(self._param_names, weights + biases): + save_buffer[name] = array_ops.identity(tensor) + + def _checkpointable_restore(self, restore_buffer): + tensors = [ + array_ops.identity(restore_buffer[name]) for name in self._param_names + ] + return self.restore( + restored_tensors=tensors, + restored_shapes=None # Unused + ) + + def _add_checkpointable_dependencies(self, checkpointable, dtype): + """Add canonical weight dependencies to `checkpointable`. + + When saving or restoring, converts to or from the opaque buffer + format. Weights are saved and loaded in the configuration expected by + cuDNN-compatible cells. + + Args: + checkpointable: An object inheriting from `CheckpointableBase` to add + dependencies too (typically the cuDNN `Layer`). + dtype: The dtype for the canonical parameter Tensors. + """ + split_dependencies = split_dependency.split_dependency( + component_names=self._param_names, + component_dtypes=(dtype,) * len(self._param_names), + fill_save_buffer_fn=self._checkpointable_save, + consume_restore_buffer_fn=self._checkpointable_restore) + self._checkpointable_track_params(checkpointable, split_dependencies) + + def _checkpointable_track_params(self, checkpointable, params): + """Tracks parameters in a canonical configuration.""" + return # NotImplementedError raised by the Layer. 
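The docstring restored above spells out the cuDNN per-layer layout (for LSTM, tensors 0-3 act on the layer input and 4-7 on the recurrent input, in input/forget/new-memory/output gate order). The per-gate reshuffle the LSTM converter performs is tiny; here is a plain-Python sketch mirroring the `_cudnn_to_tf_gate_params` context earlier in this file, with the inverse direction inferred from the fact that the swap is symmetric:

```python
def cudnn_to_tf_gate_params(i_g, f_g, c_g, o_g):
  # cuDNN canonical gate order is (i, f, c, o); TF-canonical LSTM kernels use
  # (i, c, f, o), so only the middle two gates swap places.
  return [i_g, c_g, f_g, o_g]

def tf_to_cudnn_gate_params(i_g, c_g, f_g, o_g):
  # The swap is its own inverse, so packing params back for cuDNN is symmetric.
  return [i_g, f_g, c_g, o_g]

round_trip = tf_to_cudnn_gate_params(*cudnn_to_tf_gate_params('i', 'f', 'c', 'o'))
assert round_trip == ['i', 'f', 'c', 'o']
```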
+ + def _tf_canonical_names(self): + tf_weights_names, tf_biases_names = [], [] + for i in range(self._num_layers): + if self._direction == CUDNN_RNN_UNIDIRECTION: + prefix = self._tf_canonical_name_prefix(i) + self._tf_canonical_names_single_layer(prefix, tf_weights_names, + tf_biases_names) + else: + fwd_prefix = self._tf_canonical_name_prefix(i, is_fwd=True) + bak_prefix = self._tf_canonical_name_prefix(i, is_fwd=False) + + self._tf_canonical_names_single_layer(fwd_prefix, tf_weights_names, + tf_biases_names) + self._tf_canonical_names_single_layer(bak_prefix, tf_weights_names, + tf_biases_names) + return tf_weights_names, tf_biases_names + + def _tf_canonical_name_prefix(self, layer, is_fwd=True): + if self._direction == CUDNN_RNN_UNIDIRECTION: + return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name) + else: + if is_fwd: + return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" % + (layer, self._rnn_cell_name)) + else: + return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" % + (layer, self._rnn_cell_name)) + + def _tf_canonical_names_single_layer(self, prefix, tf_weights_names, + tf_biases_names): + raise NotImplementedError("Abstract method") + + +class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable): + """SaveableObject implementation handling Cudnn LSTM opaque params.""" + + _format_converter_cls = CudnnParamsFormatConverterLSTM + _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__) + + def _tf_canonical_names_single_layer(self, prefix, tf_weights_names, + tf_bias_names): + tf_weights_names.append(prefix + "/kernel") + tf_bias_names.append(prefix + "/bias") + + def _checkpointable_track_params(self, checkpointable, params): + """Track parameters for compatibility with CudnnCompatibleLSTMCell.""" + biases = [] + weights = [] + for name in self._weight_names: + weights.append(params[name]) + for name in self._bias_names: + biases.append(params[name]) + assert len(params) == len(weights) + len(biases) + if len(weights) == 1 and len(biases) == 1: + # For single-layer cells, allow substituting a cell with no MultiRNNCell + # wrapping. 
+ kernel, = weights # pylint: disable=unbalanced-tuple-unpacking + bias, = biases # pylint: disable=unbalanced-tuple-unpacking + checkpointable._track_checkpointable(kernel, name="kernel") # pylint: disable=protected-access + checkpointable._track_checkpointable(bias, name="bias") # pylint: disable=protected-access + assert len(biases) == len(weights) + for cell_index, (bias, kernel) in enumerate(zip(biases, weights)): + cell = checkpointable_lib.Checkpointable() + checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index) # pylint: disable=protected-access + cell.bias = bias + cell.kernel = kernel + + +class CudnnGRUSaveable(CudnnOpaqueParamsSaveable): + """SaveableObject implementation handling Cudnn GRU opaque params.""" + + _format_converter_cls = CudnnParamsFormatConverterGRU + _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__) + + def _tf_canonical_names_single_layer(self, prefix, tf_weights_names, + tf_bias_names): + tf_weights_names.append(prefix + "/gates/kernel") + tf_weights_names.append(prefix + "/candidate/input_projection/kernel") + tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel") + + tf_bias_names.append(prefix + "/gates/bias") + tf_bias_names.append(prefix + "/candidate/input_projection/bias") + tf_bias_names.append(prefix + "/candidate/hidden_projection/bias") + + +class CudnnRNNTanhSaveable(CudnnLSTMSaveable): + _format_converter_cls = CudnnParamsFormatConverterTanh + _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__) + + +class CudnnRNNReluSaveable(CudnnLSTMSaveable): + _format_converter_cls = CudnnParamsFormatConverterRelu + _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__) + + _cudnn_rnn_common_doc_string = """ Cudnn RNN has an opaque parameter buffer that can be used for inference and training. 
But it is possible that the layout of the parameter buffers @@ -850,7 +942,7 @@ def _get_num_params(rnn_mode, num_layers, direction): elif rnn_mode == CUDNN_RNN_TANH: num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER else: - raise ValueError("Invalid \'rnn_mode\': %s", rnn_mode) + raise ValueError("Invalid \'rnn_mode\': %s" % rnn_mode) num_params = num_layers * num_params_per_layer if direction != CUDNN_RNN_UNIDIRECTION: num_params *= 2 @@ -918,7 +1010,7 @@ def _cudnn_rnn(inputs, "seed2": seed2, "name": name } - if use_cudnn_v2 is not "1": + if use_cudnn_v2 != "1": outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) else: outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) @@ -1582,7 +1674,7 @@ class _CudnnRNNNoInputC(_CudnnRNN): """ if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION): - raise ValueError("Invalid direction: %s", direction) + raise ValueError("Invalid direction: %s" % direction) super(_CudnnRNNNoInputC, self).__init__( self._rnn_mode, diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md index f82453f3b5e..a938f8629d8 100644 --- a/tensorflow/contrib/distribute/README.md +++ b/tensorflow/contrib/distribute/README.md @@ -46,6 +46,9 @@ Let's see how to scale to multiple GPUs on one machine using `MirroredStrategy` Take a very simple model consisting of a single layer: ```python +import tensorflow as tf +from tensorflow import keras + inputs = tf.keras.layers.Input(shape=(1,)) predictions = tf.keras.layers.Dense(1)(inputs) model = tf.keras.models.Model(inputs=inputs, outputs=predictions) @@ -90,8 +93,8 @@ Similarly, we can also call `evaluate` and `predict` as before using appropriate datasets. ```python -model.evaluate(eval_dataset) -model.predict(predict_dataset) +model.evaluate(eval_dataset, steps=1) +model.predict(predict_dataset, steps=1) ``` That's all you need to train your model with Keras on multiple GPUs with diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 22736c799d2..4094e52169a 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -374,9 +374,6 @@ cuda_py_test( tags = [ "multi_and_single_gpu", "no_pip", - # TODO(b/118820960): Re-enable this test in guitar. - "manual", - "noguitar", ], ) @@ -470,6 +467,7 @@ cuda_py_test( ], tags = [ "multi_and_single_gpu", + "no_oss", # http://b/119349471 "no_pip", ], ) @@ -492,6 +490,7 @@ cuda_py_test( ], tags = [ "multi_and_single_gpu", + "no_oss", # http://b/119349471 "no_pip", ], ) @@ -757,8 +756,6 @@ cuda_py_test( "no_oss", # TODO(b/117919883): Fix python error. "no_pip", "no_windows_gpu", - # TODO(b/118815591): Re-enable this test in guitar.) 
- "noguitar", "notsan", ], ) diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py index b311644cb22..d38bdb592a3 100644 --- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py +++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py @@ -69,7 +69,7 @@ class CheckpointUtilsWithDistributionStrategyTest( with ops.Graph().as_default() as g, distribution.scope(): if in_replica_mode: - distribution.call_for_each_replica(init_and_verify, g) + distribution.call_for_each_replica(init_and_verify, args=[g]) else: init_and_verify(g) diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py index d9339f8f75a..efa99d1fc52 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py @@ -205,7 +205,7 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): def distribute_dataset(self, dataset_fn): """Distributes the dataset to each local GPU.""" # TODO(yuefengz): shard the dataset. - return values.PerDeviceDataset( + return values.PerReplicaDataset( self._call_dataset_fn(dataset_fn), self._devices, True) def configure(self, diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py index 19b59513d81..e3d919dd0d4 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py @@ -54,8 +54,6 @@ class CollectiveAllReduceStrategyTestBase( self._run_options = config_pb2.RunOptions() self._run_options.experimental.collective_graph_key = 6 - self._sess_config = config_pb2.ConfigProto() - # We use a different key_base for each test so that collective keys won't be # reused. 
# TODO(yuefengz, tucker): enable it to reuse collective keys in different @@ -66,9 +64,10 @@ class CollectiveAllReduceStrategyTestBase( def _get_test_object(self, task_type, task_id, num_gpus=0): distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy( num_gpus_per_worker=num_gpus) + session_config = config_pb2.ConfigProto() if task_type and task_id is not None: distribution.configure( - session_config=self._sess_config, + session_config=session_config, cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id) @@ -82,14 +81,16 @@ class CollectiveAllReduceStrategyTestBase( distribution._collective_keys = collective_keys distribution._cross_tower_ops._collective_keys = collective_keys if task_type and task_id is not None: - return distribution, 'grpc://' + self._cluster_spec[task_type][task_id] + return distribution, 'grpc://' + self._cluster_spec[task_type][ + task_id], session_config else: - return distribution, '' + return distribution, '', session_config def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): - d, master_target = self._get_test_object(task_type, task_id, num_gpus) + d, master_target, config = self._get_test_object(task_type, task_id, + num_gpus) with ops.Graph().as_default(), \ - self.cached_session(config=self._sess_config, + self.cached_session(config=config, target=master_target) as sess, \ d.scope(): l = core.Dense(1, use_bias=False, name='gpu_%d' % d._num_gpus_per_worker) @@ -117,7 +118,7 @@ class CollectiveAllReduceStrategyTestBase( def step(): """Perform one optimization step.""" # Run forward & backward to get gradients, variables list. - g_v = d.call_for_each_replica(grad_fn, one) + g_v = d.call_for_each_replica(grad_fn, args=[one]) # Update the variables using the gradients and the update() function. before_list = [] after_list = [] @@ -154,7 +155,8 @@ class CollectiveAllReduceStrategyTestBase( return error_after < error_before def _test_complex_model(self, task_type, task_id, num_gpus): - d, master_target = self._get_test_object(task_type, task_id, num_gpus) + d, master_target, config = self._get_test_object(task_type, task_id, + num_gpus) def model_fn(): """Mnist model with synthetic input.""" @@ -193,7 +195,7 @@ class CollectiveAllReduceStrategyTestBase( return train_op with ops.Graph().as_default(), \ - self.cached_session(config=self._sess_config, + self.cached_session(config=config, target=master_target) as sess: with d.scope(): train_op = d.call_for_each_replica(model_fn) @@ -204,10 +206,10 @@ class CollectiveAllReduceStrategyTestBase( return True def _test_variable_initialization(self, task_type, task_id, num_gpus): - distribution, master_target = self._get_test_object(task_type, task_id, - num_gpus) + distribution, master_target, config = self._get_test_object( + task_type, task_id, num_gpus) with ops.Graph().as_default(), \ - self.cached_session(config=self._sess_config, + self.cached_session(config=config, target=master_target) as sess, \ distribution.scope(): diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py index 63a163e76cd..a5137165403 100644 --- a/tensorflow/contrib/distribute/python/combinations.py +++ b/tensorflow/contrib/distribute/python/combinations.py @@ -335,17 +335,13 @@ tpu_strategy_one_step = NamedDistribution( "TPUOneStep", lambda: tpu_lib.TPUStrategy( TPUClusterResolver(""), steps_per_run=1), required_tpu=True) -# Note that we disable prefetching for testing since prefetching makes -# the input non-deterministic. 
mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", - lambda: mirrored_lib.MirroredStrategy( - ["/gpu:0", "/cpu:0"], prefetch_on_device=False), + lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]), required_gpus=1) mirrored_strategy_with_two_gpus = NamedDistribution( "Mirrored2GPUs", - lambda: mirrored_lib.MirroredStrategy( - ["/gpu:0", "/gpu:1"], prefetch_on_device=False), + lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]), required_gpus=2) diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py index bae0f474d27..b5b349aa64e 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_ops.py +++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py @@ -62,26 +62,26 @@ def validate_destinations(destinations): raise ValueError("destinations can not be empty") -def _make_tensor_into_per_device(input_tensor): - """Converts a single tensor into a PerDevice object.""" +def _make_tensor_into_per_replica(input_tensor): + """Converts a single tensor into a PerReplica object.""" if isinstance(input_tensor, (tuple, list)): - raise ValueError("Cannot convert `input_tensor` to a `PerDevice` object, " + raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object, " "got %r but expected a object that is not a tuple or list." % (input_tensor,)) - if isinstance(input_tensor, value_lib.PerDevice): + if isinstance(input_tensor, value_lib.PerReplica): return input_tensor try: device = input_tensor.device except AttributeError: - raise ValueError("Cannot convert `input_tensor` to a `PerDevice` object " + raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object " "because it doesn't have device set.") - return value_lib.PerDevice({device: input_tensor}) + return value_lib.PerReplica({device: input_tensor}) def _normalize_value_destination_pairs(value_destination_pairs): - """Converts each tensor into a PerDevice object in the input list.""" + """Converts each tensor into a PerReplica object in the input list.""" result = [] if not isinstance(value_destination_pairs, (list, tuple)): raise ValueError("`value_destination_pairs` should be a list or tuple") @@ -93,8 +93,8 @@ def _normalize_value_destination_pairs(value_destination_pairs): raise ValueError("Each element of `value_destination_pairs` should be a " "tuple of size 2.") - per_device = _make_tensor_into_per_device(pair[0]) - result.append((per_device, pair[1])) + per_replica = _make_tensor_into_per_replica(pair[0]) + result.append((per_replica, pair[1])) return result @@ -105,7 +105,7 @@ def _validate_value_destination_pairs(value_destination_pairs): if not isinstance(value_destination_pairs, (list, tuple)): return False if not all([isinstance(pair, tuple) for pair in value_destination_pairs]): return False - if not all([isinstance(v[0], value_lib.PerDevice) + if not all([isinstance(v[0], value_lib.PerReplica) for v in value_destination_pairs]): return False return True @@ -149,26 +149,16 @@ def _simple_broadcast(value, destinations): return value_lib.Mirrored(index) -def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn, +def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn, aggregation): # pylint: disable=g-missing-docstring all_values = [] count = 0 - for v in per_device_value._index.values(): # pylint: disable=protected-access - if isinstance(v, value_lib.MapOutput): - v_list = v.get() - if not v_list: - continue - count += len(v_list) - # Sum within each device 
before aggregating across devices. - # TODO(yuefengz): Check whether it helps to use accumulation_fn here. - v = cross_tower_utils.aggregate_tensors_or_indexed_slices( - v_list, math_ops.add_n) - else: - count += 1 + for v in per_replica_value._index.values(): # pylint: disable=protected-access + count += 1 all_values.append(v) if not all_values: - raise ValueError("`per_device_value` must be non-empty") + raise ValueError("`per_replica_value` must be non-empty") with ops.device(reduce_to_device): with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT): @@ -189,8 +179,8 @@ class CrossDeviceOps(object): def __init__(self): pass - def reduce(self, aggregation, per_device_value, destinations): - """Reduce `per_device_value` to `destinations`. + def reduce(self, aggregation, per_replica_value, destinations): + """Reduce `per_replica_value` to `destinations`. It runs the reduction operation defined by `aggregation` and put the result on `destinations`. @@ -198,23 +188,23 @@ class CrossDeviceOps(object): Args: aggregation: Indicates how a variable will be aggregated. Accepted values are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`. - per_device_value: a PerDevice object or a tensor with device set. + per_replica_value: a PerReplica object or a tensor with device set. destinations: the reduction destinations. Returns: a Mirrored object. Raises: - ValueError: if per_device_value is not a PerDevice object. + ValueError: if per_replica_value is not a PerReplica object. """ - if not isinstance(per_device_value, value_lib.PerDevice): - per_device_value = _make_tensor_into_per_device(per_device_value) + if not isinstance(per_replica_value, value_lib.PerReplica): + per_replica_value = _make_tensor_into_per_replica(per_replica_value) validate_destinations(destinations) - return self._reduce(aggregation, per_device_value, destinations) + return self._reduce(aggregation, per_replica_value, destinations) def batch_reduce(self, aggregation, value_destination_pairs): - """Reduce PerDevice objects in a batch. + """Reduce PerReplica objects in a batch. Reduce each first element in `value_destination_pairs` to each second element which indicates the destinations. @@ -222,7 +212,7 @@ class CrossDeviceOps(object): Args: aggregation: Indicates how a variable will be aggregated. Accepted values are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`. - value_destination_pairs: a list or a tuple of tuples of PerDevice objects + value_destination_pairs: a list or a tuple of tuples of PerReplica objects (or tensors with device set if there is one device) and destinations. Returns: @@ -230,11 +220,11 @@ class CrossDeviceOps(object): Raises: ValueError: if `value_destination_pairs` is not a list or a tuple of - tuples of PerDevice objects and destinations + tuples of PerReplica objects and destinations """ if not _validate_value_destination_pairs(value_destination_pairs): # If the first element of each pair is a tensor, we try to turn it into a - # PerDevice object. + # PerReplica object. 
value_destination_pairs = _normalize_value_destination_pairs( value_destination_pairs) @@ -256,7 +246,7 @@ class CrossDeviceOps(object): validate_destinations(destinations) return self._broadcast(tensor, destinations) - def _reduce(self, aggregation, per_device_value, destinations): + def _reduce(self, aggregation, per_replica_value, destinations): raise NotImplementedError( "_reduce method must be implemented in descendants.") @@ -286,13 +276,13 @@ class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps): self.accumulation_fn = accumulation_fn super(ReductionToOneDeviceCrossDeviceOps, self).__init__() - def _reduce(self, aggregation, per_device_value, destinations): + def _reduce(self, aggregation, per_replica_value, destinations): if check_destinations(destinations): devices = get_devices_from(destinations) else: - devices = get_devices_from(per_device_value) + devices = get_devices_from(per_replica_value) reduce_to_device = self.reduce_to_device or devices[0] - reduced = _simple_reduce(per_device_value, reduce_to_device, + reduced = _simple_reduce(per_replica_value, reduce_to_device, self.accumulation_fn, aggregation) return self.broadcast(reduced, devices) @@ -303,7 +293,7 @@ class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps): ] -def _group_value_by_device(per_device_values): +def _group_value_by_device(per_replica_values): """Group values into sublists by their devices. This grouping is needed to call the all-reduce library because it expects a @@ -315,18 +305,18 @@ def _group_value_by_device(per_device_values): ] Args: - per_device_values: a list of PerDevice obejcts. + per_replica_values: a list of PerReplica obejcts. Returns: a list of lists, each sublist has components for its corresponding device of - PerDevice objects, paired with a None. + PerReplica objects, paired with a None. """ - destinations = per_device_values[0].devices + destinations = per_replica_values[0].devices grouped = [[] for _ in range(len(destinations))] - for per_device_value in per_device_values: + for per_replica_value in per_replica_values: # pylint: disable=protected-access - for i, v in enumerate(per_device_value._index.values()): - assert per_device_value.devices == destinations + for i, v in enumerate(per_replica_value._index.values()): + assert per_replica_value.devices == destinations grouped[i].append((v, None)) return grouped @@ -354,8 +344,8 @@ def _ungroup_and_make_mirrored(grouped_reduced, a list of Mirrored objects. 
""" index = [{} for _ in range(len(grouped_reduced[0]))] - for d, per_device_reduced in enumerate(grouped_reduced): - for i, (v, _) in enumerate(per_device_reduced): + for d, per_replica_reduced in enumerate(grouped_reduced): + for i, (v, _) in enumerate(per_replica_reduced): if aggregation == vs.VariableAggregation.MEAN: index[i][destinations[d]] = v / ( len(destinations) * num_between_graph_workers) @@ -567,13 +557,13 @@ class AllReduceCrossDeviceOps(CrossDeviceOps): self._agg_small_grads_max_group = agg_small_grads_max_group super(AllReduceCrossDeviceOps, self).__init__() - def _reduce(self, aggregation, per_device_value, destinations): + def _reduce(self, aggregation, per_replica_value, destinations): contains_indexed_slices = cross_tower_utils.contains_indexed_slices( - per_device_value) - if (_devices_match(per_device_value, destinations) + per_replica_value) + if (_devices_match(per_replica_value, destinations) and not context.executing_eagerly() and not contains_indexed_slices): - return self._batch_all_reduce(aggregation, [per_device_value])[0] + return self._batch_all_reduce(aggregation, [per_replica_value])[0] else: if contains_indexed_slices: logging.log_first_n( @@ -583,9 +573,9 @@ class AllReduceCrossDeviceOps(CrossDeviceOps): if check_destinations(destinations): devices = get_devices_from(destinations) else: - devices = get_devices_from(per_device_value) + devices = get_devices_from(per_replica_value) reduce_to_device = devices[0] - reduced = _simple_reduce(per_device_value, reduce_to_device, + reduced = _simple_reduce(per_replica_value, reduce_to_device, math_ops.add_n, aggregation) return self.broadcast(reduced, devices) @@ -609,16 +599,16 @@ class AllReduceCrossDeviceOps(CrossDeviceOps): for t, v in value_destination_pairs ] - def _batch_all_reduce(self, aggregation, per_device_values): + def _batch_all_reduce(self, aggregation, per_replica_values): """All reduce algorithm in a batch.""" logging.log_first_n( logging.INFO, "batch_all_reduce invoked for batches size = %d with " "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and " "agg_small_grads_max_group = %d" % - (len(per_device_values), self._all_reduce_alg, self._num_packs, + (len(per_replica_values), self._all_reduce_alg, self._num_packs, self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) - destinations = per_device_values[0].devices - grouped = _group_value_by_device(per_device_values) + destinations = per_replica_values[0].devices + grouped = _group_value_by_device(per_replica_values) device_grad_packs, tensor_packer = _pack_tensors( grouped, self._num_packs, self._agg_small_grads_max_bytes, @@ -639,7 +629,7 @@ class AllReduceCrossDeviceOps(CrossDeviceOps): destinations, device_grad_packs)) reduced = _unpack_tensors(reduced, tensor_packer) - return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices, + return _ungroup_and_make_mirrored(reduced, per_replica_values[0].devices, aggregation) @@ -723,18 +713,18 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps): validate_and_complete_spec(spec) for spec in all_reduce_spec ] - def _batch_all_reduce(self, aggregation, per_device_values): + def _batch_all_reduce(self, aggregation, per_replica_values): """All reduce algorithm in a batch.""" logging.log_first_n( logging.INFO, "distributed batch_all_reduce invoked for batches size = %d with " "allreduce_spec = %r, num_packs = %d, agg_small_grads_max_bytes = %d " "and agg_small_grads_max_group = %d" % - (len(per_device_values), self._all_reduce_spec, self._num_packs, + 
(len(per_replica_values), self._all_reduce_spec, self._num_packs, self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) - destinations = sorted(per_device_values[0].devices) - device_grads = _group_value_by_device(per_device_values) + destinations = sorted(per_replica_values[0].devices) + device_grads = _group_value_by_device(per_replica_values) # The all reduce library requires fully defined shapes. # TODO(yuefengz): when tensor sharding is not needed, static shapes are not @@ -805,16 +795,16 @@ class CollectiveAllReduce(CrossDeviceOps): super(CollectiveAllReduce, self).__init__() # TODO(yuefengz, tucker): is indexed slices supported by collective ops? - def _reduce(self, aggregation, per_device_value, destinations): - if cross_tower_utils.contains_indexed_slices(per_device_value): + def _reduce(self, aggregation, per_replica_value, destinations): + if cross_tower_utils.contains_indexed_slices(per_replica_value): raise ValueError( "`IndexSlices` is not supported for Collective All-Reduce.") if context.executing_eagerly(): raise ValueError( "Eager execution is not supported for Collective All-Reduce") - all_reduced = self._batch_all_reduce(aggregation, [per_device_value])[0] - if _devices_match(per_device_value, destinations): + all_reduced = self._batch_all_reduce(aggregation, [per_replica_value])[0] + if _devices_match(per_replica_value, destinations): return all_reduced else: index = {} @@ -852,7 +842,7 @@ class CollectiveAllReduce(CrossDeviceOps): for t, v in value_destination_pairs ] - def _batch_all_reduce(self, aggregation, per_device_values): + def _batch_all_reduce(self, aggregation, per_replica_values): """All-reduce across all workers in a batch.""" if context.executing_eagerly(): raise ValueError( @@ -860,9 +850,9 @@ class CollectiveAllReduce(CrossDeviceOps): logging.log_first_n( logging.INFO, "Collective All-reduce invoked with batches size = %d, " - "num_workers = %d" % (len(per_device_values), self._num_workers), 10) + "num_workers = %d" % (len(per_replica_values), self._num_workers), 10) - grouped_by_device = _group_value_by_device(per_device_values) + grouped_by_device = _group_value_by_device(per_replica_values) grouped_by_var = list(zip(*grouped_by_device)) # grouped_by_var is grouped by variables and takes the following format: @@ -892,7 +882,7 @@ class CollectiveAllReduce(CrossDeviceOps): new_device_grads = [list(x) for x in zip(*reduced_gv_list)] return _ungroup_and_make_mirrored( new_device_grads, - per_device_values[0].devices, + per_replica_values[0].devices, aggregation, num_between_graph_workers=self._num_workers) diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py index 6a9e8e00c02..3e274ba67ca 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py +++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py @@ -40,12 +40,12 @@ from tensorflow.python.ops import variable_scope as vs from tensorflow.python.training import device_util -def _make_per_device(values, devices, regroup=False): +def _make_per_replica(values, devices, regroup=False): devices = cross_tower_ops_lib.get_devices_from(devices) assert len(values) == len(devices) - # We simulate the result of regroup called on PerDevice which strips the - # PerDevice wrapper if it has only one value. + # We simulate the result of regroup called on PerReplica which strips the + # PerReplica wrapper if it has only one value. 
if len(values) == 1 and regroup: with ops.device(devices[0]): placed_v = array_ops.identity(values[0]) @@ -56,7 +56,7 @@ def _make_per_device(values, devices, regroup=False): with ops.device(d): placed_v = array_ops.identity(v) index[d] = placed_v - return value_lib.PerDevice(index) + return value_lib.PerReplica(index) # pylint: disable=g-doc-args,g-doc-return-or-yield @@ -122,11 +122,11 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): devices = distribution.worker_devices values = [constant_op.constant(float(d)) for d in range(len(devices))] - per_device = _make_per_device(values, devices) + per_replica = _make_per_replica(values, devices) mean = (len(devices) - 1.) / 2. values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))] - per_device_2 = _make_per_device(values_2, devices) + per_replica_2 = _make_per_replica(values_2, devices) mean_2 = mean + 1. destination_mirrored = _fake_mirrored(1., devices) @@ -144,39 +144,41 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): self._assert_values_equal( cross_tower_ops.reduce( vs.VariableAggregation.MEAN, - per_device, + per_replica, destinations=destinations), _fake_mirrored(mean, destinations)) self._assert_values_equal( cross_tower_ops.reduce( vs.VariableAggregation.MEAN, - per_device_2, + per_replica_2, destinations=destinations), _fake_mirrored(mean_2, destinations)) self._assert_values_equal( cross_tower_ops.reduce( - vs.VariableAggregation.SUM, per_device, + vs.VariableAggregation.SUM, per_replica, destinations=destinations), _fake_mirrored(mean * len(devices), destinations)) self._assert_values_equal( cross_tower_ops.reduce( vs.VariableAggregation.SUM, - per_device_2, + per_replica_2, destinations=destinations), _fake_mirrored(mean_2 * len(devices), destinations)) # test batch_reduce() for d1, d2 in itertools.product(all_destinations, all_destinations): self._assert_values_equal( - cross_tower_ops.batch_reduce(vs.VariableAggregation.MEAN, - [(per_device, d1), (per_device_2, d2)]), + cross_tower_ops.batch_reduce( + vs.VariableAggregation.MEAN, + [(per_replica, d1), (per_replica_2, d2)]), [ _fake_mirrored(mean, d1), _fake_mirrored(mean_2, d2) ]) self._assert_values_equal( - cross_tower_ops.batch_reduce(vs.VariableAggregation.SUM, - [(per_device, d1), (per_device_2, d2)]), + cross_tower_ops.batch_reduce( + vs.VariableAggregation.SUM, + [(per_replica, d1), (per_replica_2, d2)]), [ _fake_mirrored(mean * len(devices), d1), _fake_mirrored(mean_2 * len(devices), d2) @@ -277,9 +279,9 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): devices = ["/cpu:0", "/gpu:0"] t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0]) t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1]) - per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1}) + per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1}) result = cross_tower_ops_lib._simple_reduce( - per_device, devices[0], math_ops.add_n, vs.VariableAggregation.SUM) + per_replica, devices[0], math_ops.add_n, vs.VariableAggregation.SUM) # Test that the result is semantically equal to both the concatenated # IndexedSlices with and without duplicate indices. 
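For context on the rename exercised by the hunks above and below, here is a minimal sketch of how the `PerReplica` container (formerly `PerDevice`) flows through a `CrossDeviceOps` reduction after this change. It is an illustrative example only, assuming the `tensorflow.contrib.distribute.python` module layout shown in the file paths of this diff; it mirrors the calls used in `cross_tower_ops_test.py` rather than defining any new API.

```python
# Illustrative sketch, assuming the contrib module paths touched by this diff.
import tensorflow as tf
from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
from tensorflow.contrib.distribute.python import values as value_lib
from tensorflow.python.ops import variable_scope as vs

devices = ["/cpu:0", "/gpu:0"]

# One tensor per replica device, wrapped in the PerReplica container that
# replaces the old PerDevice class throughout this patch.
per_replica = value_lib.PerReplica({
    d: tf.constant(float(i)) for i, d in enumerate(devices)
})

# Reduce to a single device and broadcast back to the destinations; the
# result is a Mirrored value with one copy per destination device.
reduce_ops = cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()
summed = reduce_ops.reduce(
    vs.VariableAggregation.SUM, per_replica, destinations=devices)
```

With `VariableAggregation.MEAN` the same call divides by the number of destinations, which is the behavior `_ungroup_and_make_mirrored` implements in the hunks above.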
@@ -311,13 +313,14 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0]) t1 = _make_indexed_slices( [[3., 4.], [5., 6.]], [1, 3], dense_shape, devices[1]) - per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1}) + per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1}) if batch_reduce: - result = cross_tower_ops_instance.batch_reduce(aggregation, - [(per_device, devices)]) + result = cross_tower_ops_instance.batch_reduce( + aggregation, [(per_replica, devices)]) else: - result = cross_tower_ops_instance.reduce(aggregation, per_device, devices) + result = cross_tower_ops_instance.reduce( + aggregation, per_replica, devices) total_indices_with_dups = [1, 1, 3] total_indices_without_dups = [1, 3] @@ -478,11 +481,11 @@ class MultiWorkerCollectiveAllReduceTest( # Collective ops doesn't support scalar tensors, so we have to construct # 1-d tensors. values = [constant_op.constant([float(d)]) for d in range(len(devices))] - per_device = _make_per_device(values, devices, regroup=True) + per_replica = _make_per_replica(values, devices, regroup=True) mean = np.array([(len(devices) - 1.) / 2.]) values_2 = [constant_op.constant([d + 1.0]) for d in range(len(devices))] - per_device_2 = _make_per_device(values_2, devices) + per_replica_2 = _make_per_replica(values_2, devices) mean_2 = np.array([mean[0] + 1.]) destination_mirrored = _fake_mirrored(1., devices) @@ -500,26 +503,26 @@ class MultiWorkerCollectiveAllReduceTest( self._assert_values_equal( collective_all_reduce.reduce( vs.VariableAggregation.MEAN, - per_device, + per_replica, destinations=destinations), _fake_mirrored(mean, destinations), sess) self._assert_values_equal( collective_all_reduce.reduce( vs.VariableAggregation.MEAN, - per_device_2, + per_replica_2, destinations=destinations), _fake_mirrored(mean_2, destinations), sess) self._assert_values_equal( collective_all_reduce.reduce( vs.VariableAggregation.SUM, - per_device, + per_replica, destinations=destinations), _fake_mirrored(mean * len(devices) * num_workers, destinations), sess) self._assert_values_equal( collective_all_reduce.reduce( vs.VariableAggregation.SUM, - per_device_2, + per_replica_2, destinations=destinations), _fake_mirrored(mean_2 * len(devices) * num_workers, destinations), sess) @@ -528,16 +531,16 @@ class MultiWorkerCollectiveAllReduceTest( for d1, d2 in itertools.product(all_destinations, all_destinations): self._assert_values_equal( collective_all_reduce.batch_reduce(vs.VariableAggregation.MEAN, - [(per_device, d1), - (per_device_2, d2)]), + [(per_replica, d1), + (per_replica_2, d2)]), [ _fake_mirrored(mean, d1), _fake_mirrored(mean_2, d2) ], sess) self._assert_values_equal( collective_all_reduce.batch_reduce(vs.VariableAggregation.SUM, - [(per_device, d1), - (per_device_2, d2)]), + [(per_replica, d1), + (per_replica_2, d2)]), [ _fake_mirrored(mean * len(devices) * num_workers, d1), _fake_mirrored(mean_2 * len(devices) * num_workers, d2) diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py index 35324d15d44..50b3cf31e59 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_utils.py +++ b/tensorflow/contrib/distribute/python/cross_tower_utils.py @@ -667,7 +667,5 @@ def contains_indexed_slices(value): return any(contains_indexed_slices(v) for v in value) elif isinstance(value, value_lib.DistributedValues): return contains_indexed_slices(list(value._index.values())) # pylint: 
disable=protected-access - elif isinstance(value, value_lib.MapOutput): - return contains_indexed_slices(value.get()) else: return False diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py b/tensorflow/contrib/distribute/python/cross_tower_utils_test.py index d25964fa41a..e46240abbfa 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py +++ b/tensorflow/contrib/distribute/python/cross_tower_utils_test.py @@ -98,24 +98,13 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): self.assertTrue(cross_tower_utils.contains_indexed_slices((t0, t1))) @test_util.run_in_graph_and_eager_modes - def testContainsIndexedSlices_PerDevice(self): + def testContainsIndexedSlices_PerReplica(self): t0 = math_ops._as_indexed_slices( constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) - per_device = value_lib.PerDevice({"/gpu:0": t0, "/cpu:0": t1}) - self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device)) - - @test_util.run_in_graph_and_eager_modes - def testContainsIndexedSlices_PerDeviceMapOutput(self): - t0 = math_ops._as_indexed_slices( - constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) - t1 = math_ops._as_indexed_slices( - constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) - per_device = value_lib.PerDevice({ - "/gpu:0": value_lib.MapOutput([t0]), - "/cpu:0": value_lib.MapOutput([t1])}) - self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device)) + per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1}) + self.assertTrue(cross_tower_utils.contains_indexed_slices(per_replica)) @combinations.generate(combinations.combine( mode=["graph", "eager"], diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py index 018512ae5a2..8f82b4c92aa 100644 --- a/tensorflow/contrib/distribute/python/estimator_training_test.py +++ b/tensorflow/contrib/distribute/python/estimator_training_test.py @@ -300,10 +300,8 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, required_gpus=[0, 1])) def test_complete_flow_standalone_client(self, train_distribute_cls, eval_distribute_cls): - try: - train_distribute = train_distribute_cls(num_gpus=context.num_gpus()) - except TypeError: - train_distribute = train_distribute_cls(num_gpus_per_worker=2) + train_distribute = train_distribute_cls( + num_gpus_per_worker=context.num_gpus()) if eval_distribute_cls: eval_distribute = eval_distribute_cls( diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py index c7036daa3e3..0fd3acd0451 100644 --- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py +++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py @@ -61,7 +61,6 @@ def get_input_datasets(use_bfloat16=False): # train dataset train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)) train_ds = train_ds.repeat() - train_ds = train_ds.shuffle(100) train_ds = train_ds.map(lambda x, y: (tf.cast(x, cast_dtype), y)) train_ds = train_ds.batch(64, drop_remainder=True) diff --git a/tensorflow/contrib/distribute/python/input_ops.py b/tensorflow/contrib/distribute/python/input_ops.py index f07ec8234df..ac1ccd64b32 100644 --- a/tensorflow/contrib/distribute/python/input_ops.py +++ b/tensorflow/contrib/distribute/python/input_ops.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import 
division from __future__ import print_function +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers from tensorflow.python.data.util import nest from tensorflow.python.framework import ops @@ -27,9 +28,8 @@ from tensorflow.python.platform import tf_logging # TODO(priyag): Any other reader datasets to consider here? _READER_DATASET_OPS = [ - "TextLineDataset", - "TFRecordDataset", - "FixedLengthRecordDataset" + "TextLineDataset", "TFRecordDataset", "FixedLengthRecordDataset", + "FixedLengthRecordDatasetV2" ] @@ -75,6 +75,8 @@ def auto_shard_dataset(dataset, num_shards, index): # instead of updating in-place. return dataset._clone( filenames=dataset._filenames.shard(num_shards, index)) + elif isinstance(dataset, dataset_ops.RangeDataset): + return dataset.shard(num_shards, index) elif hasattr(dataset, "_map_func"): # TODO(priyag): Make this check more robust by enforcing some common # property on all map/flatmap/interleave datasets. diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py index f4c222f26c3..46a1cf41c55 100644 --- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py +++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py @@ -157,7 +157,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): dist = mirrored_strategy.MirroredStrategy(devices) with dist.scope(): (var, m, v, op, counter) = dist.call_for_each_replica( - create_fn, dist.worker_device_index, run_concurrently=False) + create_fn, args=[dist.worker_device_index]) self.evaluate(variables.global_variables_initializer()) var_val = [2.0, 2.0, 2.0] self.assertAllClose( diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py index 4cd8ac14100..0db5844e4c4 100644 --- a/tensorflow/contrib/distribute/python/keras_test.py +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -47,7 +47,6 @@ _RANDOM_SEED = 1337 _TRAIN_SIZE = 200 _INPUT_SIZE = (10,) _NUM_CLASS = 2 -_TOLERANCE = 1e-5 # TODO(anjalisridhar): Add a decorator that will allow us to run these tests as @@ -213,10 +212,76 @@ def multi_input_output_model(): return model +def get_correctness_test_inputs(use_numpy, with_distribution, + x_train, y_train, x_predict): + """Generates the inputs for correctness check when enable Keras with DS.""" + global_batch_size = 64 + batch_size = global_batch_size + # TODO(b/118776054): Use global batch size for Keras/DS support. + if with_distribution: + batch_size //= with_distribution.num_replicas_in_sync + + if use_numpy: + training_inputs = { + 'batch_size': batch_size, + 'x': x_train, + 'y': y_train, + 'epochs': 1, + 'shuffle': False, + } + eval_inputs = { + 'batch_size': batch_size, + 'x': x_train, + 'y': y_train, + } + predict_inputs = { + # TODO(b/119318587): We should not require batch_size when distribution + # is enabled. + 'batch_size': (len(x_predict) // with_distribution.num_replicas_in_sync + if with_distribution else None), + 'x': np.array(x_predict, dtype=np.float32), + } + else: + # For dataset inputs, we do not pass batch_size to + # keras.fit/evaluate/predict. The batch size is part of the dataset. 
+ train_dataset = dataset_ops.Dataset.from_tensor_slices( + (x_train, y_train)) + x = batch_wrapper(train_dataset, batch_size, with_distribution) + + training_inputs = { + 'batch_size': None, + 'x': x, + 'y': None, + 'epochs': 1, + 'shuffle': False, + 'steps_per_epoch': len(x_train) // global_batch_size, + } + eval_inputs = { + 'batch_size': None, + 'x': x, + 'y': None, + 'steps': 20, + } + predict_batch_size = len(x_predict) + if with_distribution: + predict_batch_size //= with_distribution.num_replicas_in_sync + predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict) + predict_dataset = batch_wrapper(predict_dataset, + predict_batch_size, with_distribution) + predict_inputs = { + 'batch_size': None, + 'steps': 1, + 'x': predict_dataset, + } + + return training_inputs, eval_inputs, predict_inputs + + strategies = [combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, combinations.mirrored_strategy_with_two_gpus, + combinations.tpu_strategy, # steps_per_run=2 combinations.tpu_strategy_one_step] @@ -245,6 +310,13 @@ def strategy_and_optimizer_combinations(): mode=['graph']) +def strategy_and_inputs(): + return combinations.combine( + distribution=strategies, + use_numpy=[True, False], + mode=['graph']) + + class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): def setUp(self): @@ -413,8 +485,8 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, with self.assertRaisesRegexp(ValueError, 'is smaller than the number ' 'of replicas'): - # The batch size(32) * num_replicas(3) is 96 which is greater than the - # number of input samples(64). + # The batch size(32) * num_replicas_in_sync(3) is 96 which is greater + # than the number of input samples(64). distributed_training_utils.get_input_batch_params(inputs, 32, strategy) @@ -598,36 +670,33 @@ class TestDistributionStrategyWithDatasets(test.TestCase, @combinations.generate(strategy_combinations()) def test_model_interleaved_eval_same_as_direct_eval(self, distribution): with self.cached_session(): - loss = 'mse' - user_controlled_model = get_model() - user_controlled_optimizer = gradient_descent.GradientDescentOptimizer( - 0.001) - user_controlled_metrics = ['mae', keras.metrics.CategoricalAccuracy()] - user_controlled_model.compile(user_controlled_optimizer, loss, - metrics=user_controlled_metrics, - distribute=distribution) + user_controlled_model.compile( + gradient_descent.GradientDescentOptimizer(0.001), + loss='mse', + metrics=['mae', keras.metrics.CategoricalAccuracy()], + distribute=distribution) interleaved_model = get_model() - interleaved_optimizer = gradient_descent.GradientDescentOptimizer(0.001) - interleaved_metrics = ['mae', keras.metrics.CategoricalAccuracy()] - interleaved_model.compile(interleaved_optimizer, loss, - metrics=interleaved_metrics, - distribute=distribution) + interleaved_model.set_weights(user_controlled_model.get_weights()) + interleaved_model.compile( + gradient_descent.GradientDescentOptimizer(0.001), + loss='mse', + metrics=['mae', keras.metrics.CategoricalAccuracy()], + distribute=distribution) dataset = get_dataset(distribution) # Call fit with validation interleaved - interleaved_output = interleaved_model.fit(dataset, epochs=2, - steps_per_epoch=2, verbose=0, - validation_data=dataset, - validation_steps=2) + interleaved_output = interleaved_model.fit( + dataset, epochs=2, steps_per_epoch=2, verbose=1, + validation_data=dataset, validation_steps=2, shuffle=False) # Manually control the validation running 
after each epoch. user_controlled_output = [] for _ in range(2): user_controlled_model.fit( - dataset, epochs=1, steps_per_epoch=2, verbose=0) + dataset, epochs=1, steps_per_epoch=2, verbose=1, shuffle=False) user_controlled_output.append( user_controlled_model.evaluate(dataset, steps=2)) @@ -1019,26 +1088,36 @@ class TestDistributionStrategyCorrectness(test.TestCase, distribute=distribution) batch_size = 64 - batch_size //= distribution.num_replicas + batch_size //= distribution.num_replicas_in_sync train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train)) train_dataset = batch_wrapper(train_dataset, batch_size, distribution) history = model.fit(x=train_dataset, epochs=1, steps_per_epoch=10) self.assertEqual(history.history['binary_accuracy'], [1.0]) - @combinations.generate(strategy_combinations()) - def test_correctness(self, distribution): + @combinations.generate(strategy_and_inputs()) + def test_correctness(self, distribution, use_numpy): with self.cached_session(): + tolerance = 1e-5 + + if isinstance(distribution, mirrored_strategy.MirroredStrategy): + # TODO(b/119257215): use the default one once the flakyness is fixed. + tolerance = 1e-4 + keras.backend.set_image_data_format('channels_last') - num_samples = 10000 np.random.seed(_RANDOM_SEED) random_seed.set_random_seed(_RANDOM_SEED) - # Train and predict datasets are created with the same input numpy arrays. + # Train, eval, and predict datasets are created with the same input numpy + # arrays. + # TODO(xiejw): Change this back to 10000, once we support final partial + # batch. + num_samples = 9984 x_train = np.random.rand(num_samples, 1) y_train = 3 * x_train x_train = x_train.astype('float32') y_train = y_train.astype('float32') + x_predict = [[1.], [2.], [3.], [4.]] # The model is built once and the initial weights are saved. # This is used to initialize the model for both the distribution and @@ -1052,49 +1131,38 @@ class TestDistributionStrategyCorrectness(test.TestCase, initial_weights = model.get_weights() def fit_and_predict(with_distribution=None): + # We have initialized the model to the same weight for the distribution + # and non-distribution run. model.set_weights(initial_weights) model.compile( loss=keras.losses.mean_squared_error, optimizer=gradient_descent.GradientDescentOptimizer(0.5), distribute=with_distribution) - batch_size = 64 - if with_distribution: - batch_size //= with_distribution.num_replicas - train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, - y_train)) - train_dataset = batch_wrapper(train_dataset, batch_size, distribution) - # We have initialized the model to the same weight for the distribution - # and non-distribution run. If you want to initialize the model to - # random weights for each run, you need to run the model through the - # entire dataset at least once to ensure that the weights converge to - # the same value. 
- model.fit(x=train_dataset, epochs=1, steps_per_epoch=10) + training_inputs, eval_inputs, predict_inputs = ( + get_correctness_test_inputs(use_numpy, with_distribution, + x_train, y_train, x_predict)) + model.fit(**training_inputs) + eval_result = model.evaluate(**eval_inputs) weights = model.get_weights() - x_predict = [[1.], [2.], [3.], [4.]] - predict_batch_size = 4 - if with_distribution: - predict_batch_size //= with_distribution.num_replicas - predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict) - predict_dataset = batch_wrapper(predict_dataset, - predict_batch_size, distribution) - predict_result = model.predict(predict_dataset, steps=1) + predict_result = model.predict(**predict_inputs) - return weights, predict_result + return weights, eval_result, predict_result - wts_with_ds, predict_with_ds = fit_and_predict( + wts_with_ds, eval_with_ds, predict_with_ds = fit_and_predict( with_distribution=distribution) - wts_without_ds, predict_without_ds = fit_and_predict( + wts_without_ds, eval_without_ds, predict_without_ds = fit_and_predict( with_distribution=None) - # Verify that the weights are the same within some limits of tolerance. + # Verify that the weights, eval results, predict outputs are the same + # within some limits of tolerance. self.assertAllClose( - wts_with_ds, wts_without_ds, atol=_TOLERANCE, rtol=_TOLERANCE) - # Verify that the predicted outputs are the same within some limits of - # tolerance. + wts_with_ds, wts_without_ds, atol=tolerance, rtol=tolerance) self.assertAllClose( - predict_with_ds, predict_without_ds, atol=_TOLERANCE, rtol=_TOLERANCE) + eval_with_ds, eval_without_ds, atol=tolerance, rtol=tolerance) + self.assertAllClose( + predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance) # TODO(priyag): Add a test for TPUStrategy with steps_per_run > 1. diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py index 9e1a7ad3932..c28ab416518 100644 --- a/tensorflow/contrib/distribute/python/metrics_v1_test.py +++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py @@ -100,7 +100,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase): if isinstance(distribution, tpu_strategy.TPUStrategy): def step_fn(ctx, inputs): value, update = distribution.call_for_each_replica( - metric_fn, inputs) + metric_fn, args=[inputs]) ctx.set_non_tensor_output(name="value", output=value) return distribution.group(update) @@ -111,14 +111,14 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase): # In each run, we run multiple steps, and each steps consumes as many # batches as number of replicas. batches_per_update = ( - distribution.num_replicas * distribution.steps_per_run) + distribution.num_replicas_in_sync * distribution.steps_per_run) else: value, update = distribution.call_for_each_replica( metric_fn, iterator.get_next()) update = distribution.group(update) # TODO(josh11b): Once we switch to using a global batch size for input, - # replace "distribution.num_replicas" with "1". - batches_per_update = distribution.num_replicas + # replace "distribution.num_replicas_in_sync" with "1". 
+ batches_per_update = distribution.num_replicas_in_sync self.evaluate(iterator.initializer) self.evaluate(distribution.initialize()) diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index 165732d578f..c6562463edb 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -22,7 +22,6 @@ from absl.testing import parameterized import numpy from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example from tensorflow.python.data.ops import dataset_ops @@ -67,8 +66,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def step_fn(ctx, *inputs): del ctx # Unused return distribution.group( - distribution.call_for_each_replica( - model_fn, *inputs, run_concurrently=layer.built)) + distribution.call_for_each_replica(model_fn, args=inputs)) iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn)) @@ -111,7 +109,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def run_step(): return distribution.group( distribution.call_for_each_replica( - model_fn, iterator.get_next(), run_concurrently=layer.built)) + model_fn, args=(iterator.get_next(),))) if not context.executing_eagerly(): with self.cached_session() as sess: @@ -162,8 +160,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def step_fn(ctx, *inputs): del ctx # Unused return distribution.group( - distribution.call_for_each_replica( - model_fn, *inputs, run_concurrently=layer.built)) + distribution.call_for_each_replica(model_fn, args=inputs)) iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn)) @@ -221,7 +218,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): renorm, update_ops_in_cross_replica_mode): """Verifies that moving mean updates are reduced across replicas.""" with distribution.scope(): - num_replicas = len(distribution.worker_devices) + num_replicas = distribution.num_replicas_in_sync model_fn, dataset_fn, batchnorm = batchnorm_example( optimizer_fn, batch_per_epoch=num_replicas, @@ -229,17 +226,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): renorm=renorm, update_ops_in_replica_mode=not update_ops_in_cross_replica_mode) - # Make sure prefetching is disabled since that makes the - # specific input on each device to be non deterministic, and - # this test relies on specific input being on each device. 
- if isinstance(distribution, mirrored_strategy.MirroredStrategy): - self.assertFalse(distribution._prefetch_on_device) - def step_fn(ctx, *inputs): del ctx # Unused fetches = distribution.unwrap( - distribution.call_for_each_replica( - model_fn, *inputs, run_concurrently=batchnorm.built)) + distribution.call_for_each_replica(model_fn, args=inputs)) if update_ops_in_cross_replica_mode: fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS) return control_flow_ops.group(fetches) @@ -334,8 +324,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def step_fn(ctx, x, y): del ctx # Unused return distribution.group( - distribution.call_for_each_replica( - model_fn, x, y, run_concurrently=False)) + distribution.call_for_each_replica(model_fn, args=(x, y))) iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn)) @@ -369,10 +358,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2 # with sum loss reduction, or 10.6 with mean. if loss_reduction == losses_impl.Reduction.SUM: - # Note that the "distribution.num_replicas" factor will go away once - # we split the input across replicas, instead of pulling a complete + # Note that the "distribution.num_replicas_in_sync" factor will go away + # once we split the input across replicas, instead of pulling a complete # batch of input per replica. - self.assertNear(weight, 2 + 21.2 * distribution.num_replicas, 0.0001) + self.assertNear(weight, 2 + 21.2 * distribution.num_replicas_in_sync, + 0.0001) else: # One of the mean loss reductions. self.assertNear(weight, 2 + 10.6, 0.0001) @@ -420,7 +410,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def step_fn(output_context, *inputs): (train_op, loss) = distribution.call_for_each_replica( - model_fn, output_context, *inputs, run_concurrently=False) + model_fn, args=(output_context,) + inputs) output_context.set_last_step_output( name="cross_replica_loss_agg", output=loss, @@ -491,7 +481,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def _verify_loss_output(self, initial_loss, loss_output, aggregated, distribution): if not aggregated: - self.assertEqual(distribution.num_replicas, + self.assertEqual(distribution.num_replicas_in_sync, len(distribution.unwrap(loss_output))) loss_output = distribution.reduce( aggregation=variables_lib.VariableAggregation.MEAN, diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py index c23de069498..2d75024e7a0 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py @@ -73,17 +73,14 @@ class _RequestedStop(Exception): # TODO(yuefengz): maybe create a common class for those who need to call this # _call_for_each_replica. -def _call_for_each_replica(distribution, fn, *args, **kwargs): +def _call_for_each_replica(distribution, fn, args, kwargs): """Run `fn` in separate threads, once per replica/worker device. Args: distribution: the DistributionStrategy object. fn: function to run (will be run once per device, each in its own thread). - *args: positional arguments for `fn` - **kwargs: keyword arguments for `fn`. - `"run_concurrently"`: Boolean indicating whether executions of `fn` - can be run concurrently (under eager execution only), defaults to - `True`. + args: positional arguments for `fn` + kwargs: keyword arguments for `fn`. 
Returns: Merged return value of `fn` across all replicas. @@ -92,16 +89,12 @@ def _call_for_each_replica(distribution, fn, *args, **kwargs): RuntimeError: If fn() calls get_replica_context().merge_call() a different number of times from the available devices. """ - run_concurrently = kwargs.pop("run_concurrently", True) + # TODO(josh11b): Add this option once we add synchronization to variable + # creation. Until then, this is pretty unsafe to use. + run_concurrently = False if not context.executing_eagerly(): - # Lots of TF library code isn't thread-safe in graph mode, and - # there is little to be gained by turning on multithreading when - # constructing a graph. - run_concurrently = False # Needed for per-thread device, etc. contexts in graph mode. ops.get_default_graph().switch_to_thread_local() - elif run_concurrently is None: - run_concurrently = True coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,)) @@ -192,7 +185,7 @@ def _reduce_non_distributed_value(distribution, aggregation, value, raise ValueError("You are passing a `DistributedValue` to " "`_reduce_non_distributed_value`, which is not allowed.") - # If the same value is present on all replicas then the PerDevice value will + # If the same value is present on all replicas then the PerReplica value will # be a single value. We also handle the case when `value` is a single value # and equal to 0. if value == 0: @@ -348,8 +341,6 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): specified. cross_device_ops: optional, a descedant of `CrossDeviceOps`. If this is not set, the `configure` method will try to find the best one. - prefetch_on_device: optional boolean to specify whether to prefetch input - data to devices. auto_shard_dataset: whether to auto-shard the dataset when there are multiple workers. cross_tower_ops: Deprecated alias for `cross_device_ops`. @@ -360,14 +351,12 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): num_gpus=None, num_gpus_per_worker=None, cross_device_ops=None, - prefetch_on_device=None, auto_shard_dataset=False, cross_tower_ops=None): super(MirroredStrategy, self).__init__() assert not (cross_device_ops and cross_tower_ops) self._cross_tower_ops = cross_device_ops or cross_tower_ops - self._prefetch_on_device = prefetch_on_device self._auto_shard_dataset = auto_shard_dataset # Remember num GPUs which might be needed by `configure` method. if num_gpus is not None and num_gpus_per_worker is not None: @@ -402,7 +391,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): # TODO(josh11b): Require at least 2 devices? 
self._devices = [device_util.resolve(d) for d in devices] self._canonical_device_set = set(self._devices) - self._device_index = values.PerDevice({d: i for i, d in enumerate(devices)}) + self._device_index = values.PerReplica( + {d: i for i, d in enumerate(devices)}) def _initialize_multi_worker(self, num_gpus, cluster_spec): """Initializes the object for multi-worker training.""" @@ -417,19 +407,19 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): if num_gpus is None: raise ValueError("`num_gpus` is required if `cluster_spec` is given.") if num_gpus > 0: - self._worker_device_map = { - worker: [ + self._worker_devices = [ + (worker, [ device_util.canonicalize(worker + "/device:GPU:%d" % gpu) for gpu in range(num_gpus) - ] for worker in self._workers - } + ]) for worker in self._workers + ] else: - self._worker_device_map = { - worker: [device_util.canonicalize(worker, "/device:CPU:0")] + self._worker_devices = [ + (worker, [device_util.canonicalize(worker, "/device:CPU:0")]) for worker in self._workers - } + ] - devices = nest.flatten(self._worker_device_map) + devices = nest.flatten([l for _, l in self._worker_devices]) # Setting `_default_device` will add a device scope in the # distribution.scope. We set the default device to the first worker. When @@ -446,7 +436,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): # TODO(josh11b): Require at least 2 devices? self._devices = [device_util.resolve(d) for d in devices] self._canonical_device_set = set(self._devices) - self._device_index = values.PerDevice( + self._device_index = values.PerReplica( {d: i for i, d in enumerate(devices)}) def _create_variable(self, next_creator, *args, **kwargs): @@ -490,12 +480,11 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): def distribute_dataset(self, dataset_fn): if self._cluster_spec: return values.MultiWorkerDataset( - partial(self._call_dataset_fn, dataset_fn), self._worker_device_map, - self._prefetch_on_device, self._auto_shard_dataset) + partial(self._call_dataset_fn, dataset_fn), self._worker_devices, + auto_shard=self._auto_shard_dataset) else: - return values.PerDeviceDataset( - self._call_dataset_fn(dataset_fn), self._devices, - self._prefetch_on_device) + return values.PerReplicaDataset( + self._call_dataset_fn(dataset_fn), self._devices) # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. def _run_steps_on_dataset(self, fn, iterator, iterations, @@ -546,10 +535,10 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): for (name, aggregation) in ctx._last_step_outputs_aggregations.items(): # pylint: disable=protected-access output = last_step_tensor_outputs_dict[name] # For outputs that have already been aggregated, wrap them in a Mirrored - # container, else in a PerDevice container. + # container, else in a PerReplica container. 
if aggregation is variables_lib.VariableAggregation.NONE: last_step_tensor_outputs_dict[name] = values.regroup( - {d: t for d, t in zip(self._devices, output)}, values.PerDevice) + {d: t for d, t in zip(self._devices, output)}, values.PerReplica) else: assert len(output) == 1 last_step_tensor_outputs_dict[name] = output[0] @@ -562,23 +551,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): return self._get_cross_tower_ops().broadcast(tensor, destinations or self._devices) - def _call_for_each_replica(self, fn, *args, **kwargs): - return _call_for_each_replica(self, fn, *args, **kwargs) - - def map(self, map_over, fn, *args, **kwargs): - # TODO(josh11b): In eager mode, use one thread per device. - index = {} - for i, m in enumerate(map_over): - d = self._devices[i % len(self._devices)] - with ops.device(d): - l = index.get(d, []) - l.append(fn(m, - *values.select_device_mirrored(d, args), - **values.select_device_mirrored(d, kwargs))) - index[d] = l - # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput - # in addition to PerDevice data. - return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()}) + def _call_for_each_replica(self, fn, args, kwargs): + return _call_for_each_replica(self, fn, args, kwargs) def configure(self, session_config=None, @@ -617,9 +591,10 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): def _reduce(self, aggregation, value, destinations): assert not isinstance(value, values.Mirrored) if not isinstance(value, values.DistributedValues): - # This function handles reducing values that are not PerDevice or Mirrored - # values. For example, the same value could be present on all replicas in - # which case `value` would be a single value or value could be 0. + # This function handles reducing values that are not PerReplica or + # Mirrored values. For example, the same value could be present on all + # replicas in which case `value` would be a single value or value could + # be 0. return _reduce_non_distributed_value(self, aggregation, value, destinations) if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_REPLICA: @@ -818,7 +793,7 @@ class MirroredReplicaContext(distribute_lib.ReplicaContext): `MirroredStrategy.call_for_each_replica()`). 
""" - def _merge_call(self, fn, *args, **kwargs): + def _merge_call(self, fn, args, kwargs): """Delegate to the main thread to actually perform merge_call().""" t = threading.current_thread() # a _MirroredReplicaThread t.merge_fn = fn @@ -837,5 +812,9 @@ class MirroredReplicaContext(distribute_lib.ReplicaContext): @property def device(self): + raise RuntimeError("Use .devices instead") + + @property + def devices(self): distribute_lib.require_replica_context(self) - return self._distribution_strategy.worker_devices[self._replica_id] + return [self._distribution_strategy.worker_devices[self._replica_id]] diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py index b8e7edaaf82..1fd18e09c01 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py @@ -78,11 +78,6 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase): self._test_minimize_loss_graph( self._get_distribution_strategy(), soft_placement=soft_placement) - def testMapReduce(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self._test_map_reduce(self._get_distribution_strategy()) - def testDeviceIndex(self): if not GPU_TEST: self.skipTest("Not GPU test") @@ -120,7 +115,7 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase): dist = self._get_distribution_strategy() with dist.scope(), self.assertRaises(AssertionError): - dist.call_for_each_replica(run_fn, dist.worker_device_index) + dist.call_for_each_replica(run_fn, args=(dist.worker_device_index,)) @test_util.run_in_graph_and_eager_modes def testReduceToCpu(self): @@ -132,7 +127,8 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase): dist = self._get_distribution_strategy() with dist.scope(): - result = dist.call_for_each_replica(run_fn, dist.worker_device_index) + result = dist.call_for_each_replica( + run_fn, args=(dist.worker_device_index,)) reduced = dist.reduce( variable_scope.VariableAggregation.SUM, result, @@ -152,7 +148,8 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase): dist = self._get_distribution_strategy() with dist.scope(): - result = dist.call_for_each_replica(run_fn, dist.worker_device_index) + result = dist.call_for_each_replica( + run_fn, args=(dist.worker_device_index,)) reduced = dist.reduce( variable_scope.VariableAggregation.ONLY_FIRST_REPLICA, result, @@ -207,7 +204,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) self.assertIsInstance(result, values.MirroredVariable) self.assertEquals("foo:0", result.name) @@ -225,7 +222,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) self.assertIsInstance(result, values.MirroredVariable) # Default name of "Variable" will be used. 
self.assertEquals("Variable:0", result.name) @@ -246,7 +243,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) for i, v in enumerate(result): self.assertIsInstance(v, values.MirroredVariable) self.assertEquals("foo" + str(i) + ":0", v.name) @@ -269,7 +266,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) for v in result: self.assertIsInstance(v, values.MirroredVariable) self.assertEquals(4, len(result)) @@ -293,7 +290,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): with dist.scope(): result = dist.call_for_each_replica( - model_fn, dist.worker_device_index, run_concurrently=False) + model_fn, args=(dist.worker_device_index,)) self.assertIsInstance(result, values.MirroredVariable) # The resulting mirrored variable will use the name from the first device. self.assertEquals("foo_0:0", result.name) @@ -329,8 +326,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): features = iterator.get_next() with dist.scope(): - result = dist.call_for_each_replica( - model_fn, features, run_concurrently=False) + result = dist.call_for_each_replica(model_fn, args=(features,)) suffixes = ["", "_1", "_2"] for (kernel, bias), suffix in zip(result, suffixes): self.assertIsInstance(kernel, values.MirroredVariable) @@ -368,7 +364,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): v = variable_scope.variable(1.0, name="var-main0") self.assertEquals("var-main0:0", v.name) - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) self.assertEquals(4, len(result)) v0, v1, v2, v3 = result self.assertIsInstance(v0, values.MirroredVariable) @@ -411,7 +407,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): v = variable_scope.get_variable("var-main0", [1]) self.assertEquals("main/var-main0:0", v.name) - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) self.assertEquals(4, len(result)) v0, v1, v2, v3 = result self.assertIsInstance(v0, values.MirroredVariable) @@ -448,7 +444,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): devices = ["/device:GPU:0", "/device:CPU:0"] dist = mirrored_strategy.MirroredStrategy(devices) with dist.scope(): - v0, v1 = dist.call_for_each_replica(create_fn, run_concurrently=False) + v0, v1 = dist.call_for_each_replica(create_fn) self.evaluate(v0.initializer) self.assertEqual(2.0, self.evaluate(v0.get(devices[0]))) self.assertEqual(2.0, self.evaluate(v0.get(devices[1]))) @@ -465,7 +461,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return update0, update1 update0a, update1a = dist.call_for_each_replica( - update_member_fn, dist.worker_device_index, run_concurrently=False) + update_member_fn, args=(dist.worker_device_index,)) # Update "sync on read" variable. 
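All of these call sites move to the explicit `args=` form and drop `run_concurrently`. A condensed sketch of the new convention, assuming one GPU plus the CPU as in these tests (`model_fn` here is an illustrative placeholder, not a test helper):

from tensorflow.contrib.distribute.python import mirrored_strategy
from tensorflow.python.ops import variable_scope

def model_fn(factor):
  # Runs once per replica; the variable becomes a MirroredVariable.
  v = variable_scope.variable(1.0, name="v")
  return v * factor

dist = mirrored_strategy.MirroredStrategy(["/device:GPU:0", "/device:CPU:0"])
with dist.scope():
  # Before this change: dist.call_for_each_replica(model_fn, 2.0,
  #                                                run_concurrently=False)
  result = dist.call_for_each_replica(model_fn, args=(2.0,))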
self.evaluate(dist.group(update0a)) @@ -491,7 +487,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return update0, update1 update0b, update1b = dist.call_for_each_replica( - update_state_ops_fn, dist.worker_device_index, run_concurrently=False) + update_state_ops_fn, args=(dist.worker_device_index,)) self.evaluate(dist.group(update0b)) # Update "sync on read" variable. @@ -588,7 +584,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]) with dist.scope(): - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) self.assertIsInstance(result, values.MirroredVariable) self.assertEquals("foo:0", result.name) @@ -611,7 +607,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): "/device:GPU:0": "bar" }) with self.assertRaises(RuntimeError): - _ = dist.call_for_each_replica(model_fn, names, run_concurrently=False) + _ = dist.call_for_each_replica(model_fn, args=(names,)) @test_util.run_in_graph_and_eager_modes(config=config) def testReplicaLocalVariable(self): @@ -652,7 +648,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): # Create "sum" and "mean" versions of ReplicaLocalVariables. ret_ops, ret_v_sum, ret_v_mean, regrouped_sum, regrouped_mean = ( dist.call_for_each_replica( - model_fn, dist.worker_device_index, run_concurrently=False)) + model_fn, args=(dist.worker_device_index,))) # Should see the same wrapping instance in all replicas. self.assertIs(all_v_sum[0], ret_v_sum) self.assertIs(all_v_mean[0], ret_v_mean) @@ -709,7 +705,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): with context.graph_mode(), dist.scope(): with ops.name_scope("main"): - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) self.assertEquals(2, len(result)) for v, name in zip(result, ["a", "b"]): self.assertIsInstance(v, values.DistributedValues) @@ -730,7 +726,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with context.graph_mode(), dist.scope(): - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) self.assertEquals(2, len(result)) for v, name in zip(result, ["a", "b"]): self.assertIsInstance(v, values.DistributedValues) @@ -760,7 +756,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): with context.graph_mode(), dist.scope(): with ops.name_scope("main"): a = variable_scope.variable(1.0, name="a") - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) result_b = result[0] result_c = result[1] self.assertIsInstance(result_b, values.DistributedValues) @@ -793,7 +789,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): with context.graph_mode(), dist.scope(): with ops.name_scope("main"): a = variable_scope.get_variable("a", [1]) - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) result_b = result[0] result_c = result[1] self.assertIsInstance(result_b, values.DistributedValues) @@ -824,7 +820,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with context.graph_mode(), dist.scope(): - result = dist.call_for_each_replica(model_fn, run_concurrently=False) + result = dist.call_for_each_replica(model_fn) # Two variables are created by the RNN layer. 
self.assertEquals(2, len(result)) for v in result: @@ -851,7 +847,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return var.assign(value) with dist.scope(): - ret_v_sum = dist.call_for_each_replica(model_fn, run_concurrently=False) + ret_v_sum = dist.call_for_each_replica(model_fn) update_ops = dist.update(ret_v_sum, update, 5.0, grouped=False) # Initialize variables. @@ -894,7 +890,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) @@ -908,7 +904,7 @@ class MirroredVariableUpdateTest(test.TestCase): @test_util.run_in_graph_and_eager_modes(config=config) def testAssignMirroredVarReplicaContextWithSum(self): - # Test that we don't reduce a non-per-device value with the "sum" + # Test that we don't reduce a non-per-replica value with the "sum" # aggregation type. self._skip_eager_if_gpus_less_than(1) def var_fn(): @@ -920,7 +916,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) @@ -942,7 +938,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(1.0, self.evaluate(mirrored_var)) @@ -960,7 +956,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(1.0, self.evaluate(mirrored_var)) @@ -971,8 +967,7 @@ class MirroredVariableUpdateTest(test.TestCase): mirrored_var.dtype) return mirrored_var.assign(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica( - model_fn, run_concurrently=False))) + self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) self.assertEquals(0.5, self.evaluate(mirrored_var)) @test_util.run_in_graph_and_eager_modes(config=config) @@ -986,7 +981,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(1.0, self.evaluate(mirrored_var)) @@ -994,8 +989,7 @@ class MirroredVariableUpdateTest(test.TestCase): def model_fn(): return mirrored_var.assign(5.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica( - model_fn, run_concurrently=False))) + self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) self.assertEquals(5.0, self.evaluate(mirrored_var)) @test_util.run_in_graph_and_eager_modes(config=config) @@ 
-1008,7 +1002,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(1.0, self.evaluate(mirrored_var)) @@ -1036,7 +1030,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(1.0, self.evaluate(mirrored_var)) @@ -1047,8 +1041,7 @@ class MirroredVariableUpdateTest(test.TestCase): mirrored_var.dtype) return mirrored_var.assign_add(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica( - model_fn, run_concurrently=False))) + self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) self.assertEquals(1.5, self.evaluate(mirrored_var)) @test_util.run_in_graph_and_eager_modes(config=config) @@ -1062,7 +1055,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(1.0, self.evaluate(mirrored_var)) @@ -1070,8 +1063,7 @@ class MirroredVariableUpdateTest(test.TestCase): def model_fn(): return mirrored_var.assign_add(5.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica( - model_fn, run_concurrently=False))) + self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) self.assertEquals(6.0, self.evaluate(mirrored_var)) @test_util.run_in_graph_and_eager_modes(config=config) @@ -1084,7 +1076,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(5.0, self.evaluate(mirrored_var)) @@ -1104,7 +1096,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(5.0, self.evaluate(mirrored_var)) @@ -1115,8 +1107,7 @@ class MirroredVariableUpdateTest(test.TestCase): mirrored_var.dtype) return mirrored_var.assign_sub(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica( - model_fn, run_concurrently=False))) + self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) self.assertEquals(4.5, self.evaluate(mirrored_var)) @test_util.run_in_graph_and_eager_modes(config=config) @@ -1130,7 +1121,7 @@ class MirroredVariableUpdateTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False) + mirrored_var = dist.call_for_each_replica(var_fn) 
self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(5.0, self.evaluate(mirrored_var)) @@ -1138,8 +1129,7 @@ class MirroredVariableUpdateTest(test.TestCase): def model_fn(): return mirrored_var.assign_sub(1.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica( - model_fn, run_concurrently=False))) + self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) self.assertEquals(4.0, self.evaluate(mirrored_var)) @@ -1211,8 +1201,7 @@ class ReplicaLocalVariableAssignTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - replica_local_var = dist.call_for_each_replica(model_fn, - run_concurrently=False) + replica_local_var = dist.call_for_each_replica(model_fn) self.assertTrue(isinstance(replica_local_var, values.ReplicaLocalVariable)) self.evaluate(variables.global_variables_initializer()) @@ -1243,8 +1232,7 @@ class ReplicaLocalVariableAssignTest(test.TestCase): ["/device:GPU:0", "/device:CPU:0"]) with dist.scope(): - replica_local_var = dist.call_for_each_replica(model_fn, - run_concurrently=False) + replica_local_var = dist.call_for_each_replica(model_fn) self.assertTrue(isinstance(replica_local_var, values.ReplicaLocalVariable)) self.evaluate(variables.global_variables_initializer()) @@ -1307,8 +1295,7 @@ class MirroredStrategyDefunTest(test.TestCase): mock_model = MockModel(two_variables) self.evaluate(variables.global_variables_initializer()) - result = dist.call_for_each_replica(model_fn, mock_model, *inputs, - run_concurrently=False) + result = dist.call_for_each_replica(model_fn, args=[mock_model] + inputs) for device in devices: device_result = values.select_device(device, result) device_expected_result = values.select_device(device, expected_result) @@ -1320,11 +1307,10 @@ class MirroredStrategyDefunTest(test.TestCase): # call_for_each has one trace per device. To check that the expected set # of variables was accessed on each trace, we first retrieve each # device-specific graph function. 
- per_device_graph_functions = dist.call_for_each_replica( - defun.get_concrete_function, - mock_model, *inputs, run_concurrently=False) + per_replica_graph_functions = dist.call_for_each_replica( + defun.get_concrete_function, args=[mock_model] + inputs) for device in devices: - graph_function = per_device_graph_functions.get(device=device) + graph_function = per_replica_graph_functions.get(device=device) self.assertEqual(set(mock_model.variables), set(graph_function.graph.variables)) @@ -1398,16 +1384,16 @@ class MirroredStrategyDefunTest(test.TestCase): two_variables=True) @test_util.run_in_graph_and_eager_modes() - def testPassPerDevice(self): + def testPassPerReplica(self): self._skip_eager_if_gpus_less_than(1) @function.defun def fn1(mock_model, factor): return mock_model(factor) - factors = values.PerDevice({"CPU:0": 5.0, "GPU:0": 3.0}) - expected_result = values.PerDevice({"CPU:0": 5.0 * 1.25, - "GPU:0": 3.0 * 1.25}) + factors = values.PerReplica({"CPU:0": 5.0, "GPU:0": 3.0}) + expected_result = values.PerReplica({"CPU:0": 5.0 * 1.25, + "GPU:0": 3.0 * 1.25}) self._call_and_check(fn1, [factors], expected_result, [fn1]) @test_util.run_in_graph_and_eager_modes() @@ -1429,8 +1415,7 @@ class MirroredStrategyDefunTest(test.TestCase): gradients_fn = backprop.implicit_grad(loss_fn) gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) - grads_and_vars = dist.call_for_each_replica( - gradients_fn, None, run_concurrently=False) + grads_and_vars = dist.call_for_each_replica(gradients_fn, args=(None,)) optimizer = gradient_descent.GradientDescentOptimizer(0.25) update_ops = optimizer._distributed_apply(dist, grads_and_vars) # pylint: disable=protected-access diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py index 2bfe0f3e7a6..bea684e77ca 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py @@ -40,9 +40,6 @@ class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase): def testMinimizeLossGraph(self): self._test_minimize_loss_graph(self._get_distribution_strategy()) - def testMapReduce(self): - self._test_map_reduce(self._get_distribution_strategy()) - def testDeviceIndex(self): self._test_device_index(self._get_distribution_strategy()) @@ -83,7 +80,8 @@ class VariableCreatorStackTest(test.TestCase): with context.graph_mode(), \ dist.scope(), \ variable_scope.variable_creator_scope(main_thread_creator): - result = dist.call_for_each_replica(model_fn, dist.worker_device_index) + result = dist.call_for_each_replica( + model_fn, args=(dist.worker_device_index,)) result = dist.unwrap(result) expected = ["main_thread:thread_0", "main_thread:thread_1"] self.assertEquals(expected, result) diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py index 815644421e3..7ecc852d205 100644 --- a/tensorflow/contrib/distribute/python/moving_averages_test.py +++ b/tensorflow/contrib/distribute/python/moving_averages_test.py @@ -93,7 +93,8 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase): var = variables.Variable([10.0, 11.0]) val = constant_op.constant([1.0, 2.0]) decay = 0.25 - # NOTE(josh11b): We currently generate an error if val is a PerDevice value. + # NOTE(josh11b): We currently generate an error if val is a PerReplica + # value. 
assign = moving_averages.assign_moving_average( var, val, decay, zero_debias=False) @@ -121,7 +122,8 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase): var = variables.Variable([0.0, 0.0]) val = array_ops.placeholder(dtypes.float32) decay = 0.25 - # NOTE(josh11b): We currently generate an error if val is a PerDevice value. + # NOTE(josh11b): We currently generate an error if val is a PerReplica + # value. assign = moving_averages.assign_moving_average(var, val, decay) variables.global_variables_initializer().run() diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py index 8bdf0012087..a0d8f938874 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy.py @@ -25,8 +25,6 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variable_scope as vs from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.util import nest @@ -40,10 +38,9 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): # doing something that won't work with other DistributionStrategy # implementations? - def __init__(self, device, prefetch_on_device=None): + def __init__(self, device): super(OneDeviceStrategy, self).__init__() self._device = device - self._prefetch_on_device = prefetch_on_device self._default_device = device def _create_variable(self, next_creator, *args, **kwargs): @@ -62,9 +59,8 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): return next_creator(*args, **kwargs) def distribute_dataset(self, dataset_fn): - return values.PerDeviceDataset( - self._call_dataset_fn(dataset_fn), [self._device], - self._prefetch_on_device) + return values.PerReplicaDataset( + self._call_dataset_fn(dataset_fn), [self._device]) def _broadcast(self, tensor, destinations): del destinations @@ -117,29 +113,13 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): ctx._set_last_step_outputs(last_step_tensor_outputs_dict) # pylint: disable=protected-access return ctx - def _call_for_each_replica(self, fn, *args, **kwargs): - # We don't run `fn` in multiple threads in OneDeviceStrategy. 
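With `map()`, `MapOutput`, and `prefetch_on_device` removed, the single-device strategy reduces to device placement plus the `PerReplicaDataset` wrapper. A minimal graph-mode sketch, assuming the internal module paths used elsewhere in these tests:

from tensorflow.contrib.distribute.python import one_device_strategy
from tensorflow.python.data.ops import dataset_ops

strategy = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
dataset = strategy.distribute_dataset(
    lambda: dataset_ops.Dataset.range(10).batch(2))
iterator = dataset.make_initializable_iterator()  # PerReplicaDataIterator
batch = iterator.get_next()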
- kwargs.pop("run_concurrently", None) + def _call_for_each_replica(self, fn, args, kwargs): with ops.device(self._device), _OneDeviceReplicaContext(self): return fn(*args, **kwargs) - def map(self, map_over, fn, *args, **kwargs): - with ops.device(self._device): - return values.MapOutput([fn(m, *args, **kwargs) for m in map_over]) - def _reduce(self, aggregation, value, destinations): - del destinations - if not isinstance(value, values.MapOutput): - return value - l = value.get() - assert l - with ops.device(self._device): - if aggregation == vs.VariableAggregation.SUM: - return math_ops.add_n(l) - elif aggregation == vs.VariableAggregation.MEAN: - return math_ops.add_n(l) / len(l) - else: - assert False + del aggregation, destinations + return value def _update(self, var, options, fn, *args, **kwargs): # The implementations of _update() and _update_non_slot() are identical @@ -171,6 +151,10 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): def num_replicas(self): return 1 + @property + def num_replicas_in_sync(self): + return 1 + @property def worker_devices(self): return [self._device] @@ -188,6 +172,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): class _OneDeviceReplicaContext(distribute_lib.ReplicaContext): + """ReplicaContext for OneDeviceStrategy.""" def __init__(self, distribution_strategy): distribute_lib.ReplicaContext.__init__( @@ -195,4 +180,8 @@ class _OneDeviceReplicaContext(distribute_lib.ReplicaContext): @property def device(self): - return self._distribution_strategy.worker_devices[0] + raise RuntimeError("Use .devices instead") + + @property + def devices(self): + return [self._distribution_strategy.worker_devices[0]] diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py index 3fb92273924..95f4cdb7868 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py @@ -35,9 +35,6 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase): def testMinimizeLossGraph(self): self._test_minimize_loss_graph(self._get_distribution_strategy()) - def testMapReduce(self): - self._test_map_reduce(self._get_distribution_strategy()) - def testDeviceIndex(self): self._test_device_index(self._get_distribution_strategy()) diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py index 0554f4a83bd..fa4705af7cb 100644 --- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py +++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py @@ -51,7 +51,7 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase): def run_step(): return control_flow_ops.group(distribution.unwrap( distribution.call_for_each_replica( - model_fn, iterator.get_next(), run_concurrently=layer.built))) + model_fn, args=(iterator.get_next(),)))) if not context.executing_eagerly(): with self.cached_session() as sess: diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py index 2aa7f1ae5d6..790b37f8601 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py @@ -64,7 +64,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): Operations that occur only on the first replica (such as incrementing the global 
step), will occur on the first replica *of every worker*. - It is expected to call `call_for_each_replica(fn, *args, **kwargs)` for any + It is expected to call `call_for_each_replica(fn, ...)` for any operations which potentially can be replicated across replicas (i.e. multiple GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra caution needs to be taken: @@ -223,7 +223,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): def distribute_dataset(self, dataset_fn): """Distributes the dataset to each local GPU.""" - return values.PerDeviceDataset( + return values.PerReplicaDataset( self._call_dataset_fn(dataset_fn), self._compute_devices, True) def _broadcast(self, tensor, destinations): @@ -231,10 +231,13 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): destinations = self._compute_devices return self._cross_tower_ops.broadcast(tensor, destinations) + def _allow_variable_partition(self): + return not context.executing_eagerly() + # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through # this creator, such as "MutableHashTable". def _create_variable(self, next_creator, *args, **kwargs): - if self.num_replicas > 1: + if self.num_replicas_in_sync > 1: aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE) if aggregation not in ( vs.VariableAggregation.NONE, @@ -288,9 +291,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): with ops.device(self._variable_device): return var_creator(*args, **kwargs) - def _call_for_each_replica(self, fn, *args, **kwargs): + def _call_for_each_replica(self, fn, args, kwargs): # pylint: disable=protected-access - return mirrored_strategy._call_for_each_replica(self, fn, *args, **kwargs) + return mirrored_strategy._call_for_each_replica(self, fn, args, kwargs) def _verify_destinations_not_different_worker(self, destinations): if not self._cluster_spec: @@ -336,9 +339,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): "You cannot update variable with a Mirrored object with multiple " "components %r when using ParameterServerStrategy. You must " "specify a single value or a Mirrored with a single value." % x) - elif isinstance(x, values.PerDevice): + elif isinstance(x, values.PerReplica): raise ValueError( - "You cannot update variable with a PerDevice object %r when using " + "You cannot update variable with a PerReplica object %r when using " "ParameterServerStrategy. You must specify a single value or a " "Mirrored with a single value" % x) else: diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index a9f643c6ecc..81a23c89030 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -37,6 +37,7 @@ from tensorflow.python.layers import core from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradients +from tensorflow.python.ops import partitioned_variables from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import test @@ -85,8 +86,7 @@ class ParameterServerStrategyTestBase( config=sess_config) as sess, \ d.scope(): - # Define a variable outside the call_for_each_replica scope. This is not - # recommended. + # Define a variable outside the call_for_each_replica scope. 
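The new `_allow_variable_partition()` hook (graph mode only) is what lets a `partitioner` take effect under this strategy; the test added in the next hunk exercises it end to end. A condensed sketch, where `d` is assumed to be an already configured `ParameterServerStrategy`:

from tensorflow.python.framework import constant_op
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.ops import variable_scope

partitioner = partitioned_variables.fixed_size_partitioner(
    len(d.parameter_devices))
with d.scope():
  n = variable_scope.get_variable(
      "n",
      initializer=constant_op.constant([10.0, 20.0]),
      aggregation=variable_scope.VariableAggregation.SUM,
      partitioner=partitioner)
  # Each partition of `n` lands on a different /job:ps task.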
n = variable_scope.get_variable('n', initializer=10.0) self.assertEqual(n.device, '/job:ps/task:0') @@ -178,6 +178,75 @@ class ParameterServerStrategyTestBase( self.assertEqual(z_val, 43.0) self.assertEqual(f_val, 46.0) + def _test_device_assignment_distributed_enable_partitioner( + self, task_type, task_id, num_gpus): + d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) + num_shards = len(d.parameter_devices) + partitioner = partitioned_variables.fixed_size_partitioner(num_shards) + with ops.Graph().as_default(), \ + self.cached_session(target=self._default_target, + config=sess_config) as sess, \ + d.scope(): + + n = variable_scope.get_variable( + 'n', + initializer=constant_op.constant([10.0, 20.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + + for part_id, var in enumerate(n): + self.assertEqual(var.device, '/job:ps/task:%d' % part_id) + + def model_fn(): + a = constant_op.constant([3.0, 5.0]) + # The device scope is ignored for variables but not for normal ops. + with ops.device('/job:worker/task:0'): + x = variable_scope.get_variable( + 'x', + initializer=constant_op.constant([10.0, 20.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + x_add = x.assign_add(a, name='x_add') + # The variable x is on the task 1 since the device_function has been + # called once before the model_fn. + for part_id, var in enumerate(x): + self.assertEqual(var.device, '/job:ps/task:%d' % part_id) + self.assertEqual(var.device, x_add[part_id].device) + + # The colocate_vars_with can override the distribution's device. + with d.colocate_vars_with(x_add[0]): + y = variable_scope.get_variable( + 'y', + initializer=constant_op.constant([20.0, 10.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + y_add = y.assign_add( + [array_ops.identity(x_add[0]), + array_ops.identity(x_add[1])]) + + for part_id, var in enumerate(y): + self.assertEqual(var.device, '/job:ps/task:0') + self.assertEqual(y_add[part_id].device, var.device) + self.assertEqual(var.device, x_add[0].device) + + return x_add, y_add + + x, y = d.call_for_each_replica(model_fn) + + if context.num_gpus() >= 1: + variables.global_variables_initializer().run() + x_val, y_val = sess.run([x, y]) + if num_gpus < 1: + self.assertEqual(x_val, [13.0, 25.0]) + self.assertEqual(y_val, [33.0, 35.0]) + else: + x_expect = [10.0 + 3 * num_gpus, 20.0 + 5 * num_gpus] + y_expect = [ + 20.0 + x_expect[0] * num_gpus, 10.0 + x_expect[1] * num_gpus + ] + self.assertEqual(x_val, x_expect) + self.assertEqual(y_val, y_expect) + def _test_device_assignment_local(self, d, compute_device='CPU', @@ -345,11 +414,11 @@ class ParameterServerStrategyTestBase( self._finish_condition.release() x_val, y_val, z_val = sess.run([x, y, z]) - self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas) - self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas) + self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas_in_sync) + self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas_in_sync) self.assertEqual(z_val, 30.0 + 1.0 * num_workers) - return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas and - y_val == 20.0 + 1.0 * num_workers * d.num_replicas and + return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas_in_sync and + y_val == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync and z_val == 30.0 + 1.0 * num_workers) def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): @@ -394,7 +463,7 @@ class 
ParameterServerStrategyTestBase( def step(): """Perform one optimization step.""" # Run forward & backward to get gradients, variables list. - g_v = d.call_for_each_replica(grad_fn, one) + g_v = d.call_for_each_replica(grad_fn, args=(one,)) # Update the variables using the gradients and the update() function. before_list = [] after_list = [] @@ -479,6 +548,12 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase, def testDeviceAssignmentDistributed(self, num_gpus): self._test_device_assignment_distributed('worker', 1, num_gpus) + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus): + self._test_device_assignment_distributed_enable_partitioner( + 'worker', 1, num_gpus) + def testSimpleBetweenGraph(self): self._run_between_graph_clients(self._test_simple_increment, self._cluster_spec, context.num_gpus()) diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py index a5adaac47ce..3dc815f0371 100644 --- a/tensorflow/contrib/distribute/python/step_fn.py +++ b/tensorflow/contrib/distribute/python/step_fn.py @@ -90,7 +90,6 @@ class StandardSingleLossStep(StandardInputStep): super(StandardSingleLossStep, self).__init__(dataset_fn, distribution) self._loss_fn = loss_fn self._optimizer = optimizer - self._is_run_concurrently = False self._iterations_per_step = iterations_per_step def __call__(self): @@ -101,14 +100,11 @@ class StandardSingleLossStep(StandardInputStep): gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) grads_and_vars = self.distribution.call_for_each_replica( - gradients_fn, - ctx, *inputs, - run_concurrently=self._is_run_concurrently) + gradients_fn, args=(ctx,) + inputs) # If threads use layers, then we need to run the first step # sequentially, so that layers.build() is not executed in parallel. # Otherwise, multiple sets of mirrored variables are going to be # created. - self._is_run_concurrently = True return self._optimizer._distributed_apply( # pylint: disable=protected-access self.distribution, grads_and_vars) diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py index 60ef0a2106a..3c0c10430eb 100644 --- a/tensorflow/contrib/distribute/python/strategy_test_lib.py +++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py @@ -104,7 +104,7 @@ class DistributionTestBase(test.TestCase): def step(): """Perform one optimization step.""" # Run forward & backward to get gradients, variables list. - g_v = d.call_for_each_replica(grad_fn, one, run_concurrently=l.built) + g_v = d.call_for_each_replica(grad_fn, args=(one,)) # Update the variables using the gradients and the update() function. before_list = [] @@ -160,7 +160,7 @@ class DistributionTestBase(test.TestCase): def step(): """Perform one optimization step.""" # Run forward & backward to get gradients, variables list. - g_v = d.call_for_each_replica(grad_fn, one) + g_v = d.call_for_each_replica(grad_fn, args=(one,)) # Update the variables using the gradients and the update() function. 
before_list = [] @@ -189,15 +189,6 @@ class DistributionTestBase(test.TestCase): # Error should go down self.assertLess(error_after, error_before) - def _test_map_reduce(self, d, in_graph=None): - with d.scope(): - map_in = [constant_op.constant(i) for i in range(10)] - map_out = d.map(map_in, lambda x, y: x * y, 2) - observed = d.reduce(variable_scope.VariableAggregation.SUM, map_out, - "/device:CPU:0") - expected = 90 # 2 * (0 + 1 + ... + 9) - self.assertEqual(expected, observed.numpy()) - def _test_device_index(self, d): with d.scope(): expected_devices = [False] * len(d.worker_devices) @@ -207,7 +198,7 @@ class DistributionTestBase(test.TestCase): self.assertFalse(expected_devices[device_id]) expected_devices[device_id] = True - d.call_for_each_replica(mark_devices_fn, d.worker_device_index) + d.call_for_each_replica(mark_devices_fn, args=(d.worker_device_index,)) self.assertAllEqual(expected_devices, [True] * len(d.worker_devices)) def _test_replica_id(self, d): diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index 65ef21df09b..f5b4531ba8c 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -141,7 +141,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy): # parallelism. device_map = {d.name: i for i, d in enumerate(self._tpu_metadata.devices) if "device:TPU:" in d.name} - self._device_index = values.PerDevice(device_map) + self._device_index = values.PerReplica(device_map) self._host_device = self.get_host_cpu_device(0) self._tpu_devices = sorted(device_map.keys()) # Only create variables for the number of replicas we're running. @@ -215,12 +215,12 @@ class TPUStrategy(distribute_lib.DistributionStrategy): return enqueue_op_per_host def distribute_dataset(self, dataset_fn): - worker_map = { - self.get_host(hid): [self.get_host_cpu_device(hid)] + worker_devices = [ + (self.get_host(hid), [self.get_host_cpu_device(hid)]) for hid in range(self.num_hosts) - } + ] return values.MultiWorkerDataset( - functools.partial(self._call_dataset_fn, dataset_fn), worker_map) + functools.partial(self._call_dataset_fn, dataset_fn), worker_devices) # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have @@ -308,7 +308,8 @@ class TPUStrategy(distribute_lib.DistributionStrategy): # For outputs that have already been aggregated, take the first value # from the list as each value should be the same. Else return the full # list of values. - # TODO(josh11b): If aggregation is NONE, we should return a PerDevice value. + # TODO(josh11b): If aggregation is NONE, we should return a PerReplica + # value. if aggregation is not variables_lib.VariableAggregation.NONE: # TODO(priyag): Should this return the element or a list with 1 element last_step_tensor_outputs_dict[name] = output[0] @@ -316,10 +317,9 @@ class TPUStrategy(distribute_lib.DistributionStrategy): return ctx - def _call_for_each_replica(self, fn, *args, **kwargs): + def _call_for_each_replica(self, fn, args, kwargs): # TODO(jhseu): Consider making it so call_for_each_replica implies that # we're in a tpu.rewrite(), and update TPUMirroredVariable accordingly. 
- kwargs.pop("run_concurrently", None) with _TPUReplicaContext(self): return fn(*args, **kwargs) @@ -445,7 +445,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy): return [val.get(device=d) for d in sorted(val.devices)] elif isinstance(val, list): # TODO(josh11b): We need to remove this case; per device values should - # be represented using a PerDevice wrapper instead of a list with + # be represented using a PerReplica wrapper instead of a list with # one entry per device. return val return [val] @@ -544,5 +544,9 @@ class _TPUReplicaContext(distribute_lib.ReplicaContext): @property def device(self): + raise RuntimeError("Use .devices instead") + + @property + def devices(self): distribute_lib.require_replica_context(self) - return self._distribution_strategy.worker_devices[self._replica_id] + return [self._distribution_strategy.worker_devices[self._replica_id]] diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 42fb92014a0..a1629735353 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -51,7 +51,7 @@ from tensorflow.python.util import nest # TODO(josh11b): Should device values be strings or DeviceSpec objects? # Not sure DeviceSpec objects are usable as a dict key. class DistributedValues(object): - """Holds a map from device to values. Either PerDevice or Mirrored.""" + """Holds a map from device to values. Either PerReplica or Mirrored.""" def __init__(self, index): self._index = {device_util.canonicalize(key): value @@ -62,7 +62,8 @@ class DistributedValues(object): if device is None: replica_context = distribution_strategy_context.get_replica_context() if replica_context: - device = replica_context.device + # TODO(josh11b): support model parallelism better here + device = replica_context.devices[0] else: device = distribute_lib.get_update_device() if device is None: @@ -75,10 +76,6 @@ class DistributedValues(object): ValueError("Device %s not found in %s (current device %s)" % (device, self._index.keys(), device_util.current())), e) - def on_device(self, device): - device = device_util.canonicalize(device) - return device in self._index - @property def devices(self): return list(self._index.keys()) @@ -167,12 +164,12 @@ class DistributedDelegate(DistributedValues): # TODO(josh11b): Even more operator overloads. -class PerDevice(DistributedValues): +class PerReplica(DistributedValues): """Holds a map from device to unsynchronized values.""" pass -# Note that unlike PerDevice, Mirrored values inherit from +# Note that unlike PerReplica, Mirrored values inherit from # DistributedDelegate and so can be used directly in cross-replica mode. class Mirrored(DistributedDelegate): """Holds a map from device to values which are kept in sync.""" @@ -482,7 +479,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase): if device is None: replica_context = distribution_strategy_context.get_replica_context() if replica_context: - device = replica_context.device + # TODO(josh11b): support model parallelism better here + device = replica_context.devices[0] else: device = distribute_lib.get_update_device() if device is None: @@ -583,7 +581,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase): # update_non_slot() function (like OptimizerV2._finish), which can # update several non-slot variables in one call. 
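`ReplicaContext.device` now raises, and callers pick an entry from the `devices` list instead (one device per replica in these strategies, with model parallelism left as a TODO). A hedged sketch of the lookup pattern, where `fn` stands for a function run via `call_for_each_replica` and the import path is assumed from the modules used here:

from tensorflow.python.training import distribution_strategy_context

def fn():
  replica_ctx = distribution_strategy_context.get_replica_context()
  # Old: replica_ctx.device, which now raises RuntimeError("Use .devices instead").
  device = replica_ctx.devices[0]
  return device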
def _assign_func(self, *args, **kwargs): - if distribution_strategy_context.get_distribution_strategy().__class__.__name__ != "TPUStrategy": + strategy = distribution_strategy_context.get_distribution_strategy() + if strategy.__class__.__name__ != "TPUStrategy": raise ValueError("You may only assign to a TPUMirroredVariable within a " "TPUStrategy.") f = kwargs.pop("f") @@ -776,6 +775,18 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase): def op(self): return self._primary_var.op + # pylint: disable=protected-access + @property + def _save_slice_info(self): + return self._primary_var._save_slice_info + + def _get_save_slice_info(self): + return self._primary_var._get_save_slice_info() + + def _set_save_slice_info(self, save_slice_info): + return self._primary_var._set_save_slice_info(save_slice_info) + # pylint: enable=protected-access + @property def _in_graph_mode(self): return self._primary_var._in_graph_mode # pylint: disable=protected-access @@ -861,7 +872,7 @@ def _assert_replica_context(): "Replica-local variables may only be assigned in a replica context.") -class ReplicaLocalVariable(DistributedVariable, PerDevice, +class ReplicaLocalVariable(DistributedVariable, PerReplica, checkpointable.CheckpointableBase): """Holds a map from device to variables whose values are reduced on save.""" @@ -942,9 +953,9 @@ def _devices_match(d1, d2): return device_util.canonicalize(d1) == device_util.canonicalize(d2) -def regroup(per_device, wrap_class=PerDevice): - """Makes device->nest map into a nest of PerDevice/Mirrored values.""" - items = list(per_device.items()) +def regroup(per_replica, wrap_class=PerReplica): + """Makes device->nest map into a nest of PerReplica/Mirrored values.""" + items = list(per_replica.items()) assert items v0 = items[0][1] # First value @@ -1005,7 +1016,7 @@ def regroup(per_device, wrap_class=PerDevice): # want to return the containing MirroredVariable, after a bunch of # sanity checking. In particular, each component should have the # same container, and the devices of the variables should match the - # keys of the per-device dictionary. + # keys of the per-replica dictionary. 
+ # keys of the per-replica dictionary.
if hasattr(v0, "_distributed_container"): # pylint: disable=protected-access assert not isinstance(v0, MirroredVariable), ( @@ -1021,11 +1032,11 @@ def regroup(per_device, wrap_class=PerDevice): return distributed_container # pylint: enable=protected-access - return wrap_class(per_device) + return wrap_class(per_replica) def select_device(device, structured): - """Specialize a nest of regular & per-device values for one device.""" + """Specialize a nest of regular & per-replica values for one device.""" def _get(x): return x.get(device) if isinstance(x, DistributedValues) else x @@ -1070,8 +1081,8 @@ def update_regroup(strategy, updates, should_group): return nest.pack_sequence_as(regrouped, grouped_flat) -class PerDeviceDataIterator(object): - """An iterator (like `tf.data.Iterator`) into a `PerDeviceDataset`.""" +class PerReplicaDataIterator(object): + """An iterator (like `tf.data.Iterator`) into a `PerReplicaDataset`.""" def __init__(self, iterator, devices, prefetch_on_device=None): self._iterator = iterator @@ -1114,8 +1125,8 @@ class PerDeviceDataIterator(object): return self._iterator.output_types -class PerDeviceDataset(object): - """Like `tf.data.Dataset` split devices, producing `PerDevice` data.""" +class PerReplicaDataset(object): + """Like `tf.data.Dataset` split devices, producing `PerReplica` data.""" def __init__(self, dataset, devices, prefetch_on_device=None): self._devices = devices @@ -1136,20 +1147,20 @@ class PerDeviceDataset(object): self._dataset = dataset.batch(len(devices), drop_remainder=True) def make_one_shot_iterator(self): - """Get a one time use iterator for the distributed PerDeviceDataset.""" + """Get a one time use iterator for the distributed PerReplicaDataset.""" # Graph mode with one shot iterator is disabled. if not context.executing_eagerly(): raise ValueError("Cannot create a one shot iterator. Please use " "`make_initializable_iterator()` instead.") # Eager mode prefetching would error out in constructor. Only remaining # case is non-prefetching in eager mode. We delegate to - # PerDeviceDataIterator to handle that case. + # PerReplicaDataIterator to handle that case. dataset_iterator = self._dataset.make_one_shot_iterator() - return PerDeviceDataIterator( + return PerReplicaDataIterator( dataset_iterator, self._devices, prefetch_on_device=False) def make_initializable_iterator(self): - """Get an initializable iterator for the distributed PerDeviceDataset.""" + """Get an initializable iterator for the distributed PerReplicaDataset.""" # Eager mode generates already initialized iterators. Hence we cannot create # an initializable iterator. if context.executing_eagerly(): @@ -1160,7 +1171,7 @@ class PerDeviceDataset(object): self._dataset, self._devices) else: dataset_iterator = self._dataset.make_initializable_iterator() - return PerDeviceDataIterator( + return PerReplicaDataIterator( dataset_iterator, self._devices, prefetch_on_device=self._prefetch_on_device) @@ -1169,43 +1180,47 @@ class PerDeviceDataset(object): class MultiWorkerDataIterator(object): """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`.""" - def __init__(self, iterators, worker_device_map): + def __init__(self, iterators, worker_device_pairs): """Initialize the MultiWorkerDataIterator object. Args: - iterators: a dict mapping from each worker to an iterator for - that worker. - worker_device_map: a dict mapping from each worker's devices to a list of - devices that belong to this worker. + iterators: a list of worker, iterator pairs. 
+ worker_device_pairs: a list of (worker's devices, a list of + devices that belong to this worker) pairs. Raises: - ValueError: if iterators and worker_device_map are not compatible. + ValueError: if iterators and worker_device_pairs are not compatible. """ - self._iterators = iterators - self._worker_device_map = worker_device_map - if set(self._iterators) != set(self._worker_device_map): - raise ValueError("iterators and worker_device_map are not compatible.") + if [d for d, _ in iterators] != [d for d, _ in worker_device_pairs]: + raise ValueError("iterators and worker_device_pairs are not compatible.") + self._workers = [d for d, _ in iterators] + self._iterators = [i for _, i in iterators] + self._worker_devices = [l for _, l in worker_device_pairs] @property def initializer(self): return control_flow_ops.group( - [iterator.initializer for iterator in self._iterators.values()]) + [iterator.initializer for iterator in self._iterators]) def get_iterator(self, worker): - return self._iterators.get(worker) + for i, w in enumerate(self._workers): + if worker == w: + return self._iterators[i] + return None @property def output_shapes(self): - return self._iterators.values()[0].output_shapes + return self._iterators[0].output_shapes @property def output_types(self): - return self._iterators.values()[0].output_types + return self._iterators[0].output_types def get_next(self, name=None): """Scatter the input across hosts and devices.""" index = {} - for worker, iterator in six.iteritems(self._iterators): + worker_info = zip(self._workers, self._iterators, self._worker_devices) + for worker, iterator, worker_devices in worker_info: if name is not None: d = tf_device.DeviceSpec.from_string(worker) new_name = "%s_%s_%d" % (name, d.job, d.task) @@ -1214,13 +1229,12 @@ class MultiWorkerDataIterator(object): with ops.device(worker): data_per_worker = iterator.get_next(name=new_name) - worker_devices = self._worker_device_map[worker] - # Ungroup these per-device value so as to get a flat map from devices to + # Ungroup these per-replica value so as to get a flat map from devices to # values. for d in worker_devices: v = select_device(d, data_per_worker) if d in index: - raise ValueError("Duplicated devices in worker_device_map: %r" % v) + raise ValueError("Duplicated devices in worker_device_pairs: %r" % v) index[d] = v return regroup(index) @@ -1229,153 +1243,48 @@ class MultiWorkerDataIterator(object): class MultiWorkerDataset(object): """Like a `tf.data.Dataset` that distributes data to different workers. - Each worker gets one shard of the input dataset. It is currently not working - in - eager mode. + Each worker gets one shard of the input dataset. This currently does not work + in eager mode. """ - def __init__(self, dataset_fn, worker_device_map, prefetch_on_device=None, + def __init__(self, dataset_fn, worker_device_pairs, prefetch_on_device=None, auto_shard=False): """Initialize the MultiWorkerDataset object. Args: dataset_fn: a function that returns a `tf.data.Dataset`. - worker_device_map: a dict mapping from each worker to a list of devices - that belong to this worker. + worker_device_pairs: a list of (worker, list of devices on that worker) + pairs. prefetch_on_device: whether to prefetch to devices. auto_shard: whether to auto-shard the dataset. """ - self._worker_device_map = worker_device_map - self._datasets = {} + self._worker_device_pairs = worker_device_pairs + self._datasets = [] # TODO(yuefengz, priyag): support different set of jobs for input # processing. 
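The worker-keyed dict becomes an ordered list of (worker, devices) pairs throughout. A rough construction sketch; the worker addresses and `dataset_fn` below are illustrative placeholders only:

from tensorflow.contrib.distribute.python import values
from tensorflow.python.data.ops import dataset_ops

def dataset_fn():
  return dataset_ops.Dataset.range(100).batch(4)

worker_device_pairs = [
    ("/job:worker/task:0", ["/job:worker/task:0/device:GPU:0",
                            "/job:worker/task:0/device:GPU:1"]),
    ("/job:worker/task:1", ["/job:worker/task:1/device:GPU:0",
                            "/job:worker/task:1/device:GPU:1"]),
]
multi_worker_dataset = values.MultiWorkerDataset(
    dataset_fn, worker_device_pairs, auto_shard=True)
iterator = multi_worker_dataset.make_initializable_iterator()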
- for i, (worker, worker_devices) in enumerate( - six.iteritems(worker_device_map)): + for i, (worker, worker_devices) in enumerate(worker_device_pairs): with ops.device(worker): worker_input = dataset_fn() if auto_shard: worker_input = input_ops.auto_shard_dataset( - worker_input, len(worker_device_map), i) - self._datasets[worker] = PerDeviceDataset( + worker_input, len(worker_device_pairs), i) + dataset = PerReplicaDataset( worker_input, worker_devices, prefetch_on_device=prefetch_on_device) + self._datasets.append((worker, dataset)) def make_one_shot_iterator(self): - iterators = {} - for worker, dataset in six.iteritems(self._datasets): + iterators = [] + for worker, dataset in self._datasets: with ops.device(worker): - iterators[worker] = dataset.make_one_shot_iterator() - return MultiWorkerDataIterator(iterators, self._worker_device_map) + iterators.append((worker, dataset.make_one_shot_iterator())) + return MultiWorkerDataIterator(iterators, self._worker_device_pairs) def make_initializable_iterator(self): - iterators = {} - for worker, dataset in six.iteritems(self._datasets): + iterators = [] + for worker, dataset in self._datasets: with ops.device(worker): - iterators[worker] = dataset.make_initializable_iterator() - return MultiWorkerDataIterator(iterators, self._worker_device_map) - - -class _PerKey(object): - """Holds data associated by keys.""" - - def __init__(self, *index): - # pylint: disable=protected-access - self._index = list(index) - - def get(self, iteration): - return array_ops.gather(self._index, iteration) - - def get_shape(self): - return self._index[-1][-1].get_shape() - - def get_dtype(self): - return self._index[-1][-1].dtype - - def __str__(self): - return "%s:%s" % (self.__class__.__name__, self._index) - - def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self._index) - - -class PerIteration(_PerKey): - """Holds input for multiple iterations at once.""" - - def __init__(self, *index): - # pylint: disable=protected-access - super(PerIteration, self).__init__(*[batch._index for batch in index]) - - -class Batches(_PerKey): - pass - - -class MultiIterator(object): - """Iterator that returns results of multiple get_next()s.""" - - def __init__(self, dataset_iterator, iterations, batches_per_iteration): - self._dataset_iterator = dataset_iterator - self._iterations = iterations - self._batches_per_iteration = batches_per_iteration - - def get_next(self, name=None): - """Return PerIteration with `iterations x batches_per_iteration` inputs.""" - data = [] - for _ in range(self._batches_per_iteration): - batch = [] - for _ in range(self._iterations): - batch.append(self._dataset_iterator.get_next(name=name)) - data.append(batch) - - # Here is an example. Suppose each get_next returns a tuple of two tensors. 
- # For 3 `iterations` and 2 `batches_per_iteration`, the `data` is: - # [[(a,z), (b,y), (c,x)], [(A,Z), (B,Y), (C,X)]] - # - # After the first `map_structure` it gets transformed to: - # [(Batches(a, A), Batches(z, Z)), - # (Batches(b, B), Batches(y, Y)), - # (Batches(c, C), Batches(x, X))] - # - # After the second `map_structure` it gets transformed to a tuple of: - # (PerIteration([Batches(a, A), Batches(b, B), Batches(c, C)]), - # PerIteration([Batches(z, Z), Batches(y, Y), Batches(x, X)])) - - data = nest.map_structure(Batches, *data) - data = nest.map_structure(PerIteration, *data) - - return data - - @property - def initializer(self): - return self._dataset_iterator.initializer - - -class PerIterationDataset(object): - """A dataset that returns MultiIterators.""" - - def __init__(self, dataset, iterations, batches_per_iteration): - self._dataset = dataset - self._iterations = iterations - self._batches_per_iteration = batches_per_iteration - - def make_one_shot_iterator(self): - iterator = self._dataset.make_one_shot_iterator() - return MultiIterator(iterator, self._iterations, - self._batches_per_iteration) - - def make_initializable_iterator(self): - iterator = self._dataset.make_initializable_iterator() - return MultiIterator(iterator, self._iterations, - self._batches_per_iteration) - - -class MapOutput(object): - """Map can result in multiple outputs per device.""" - - def __init__(self, l): - self._l = l - - def get(self): - return self._l + iterators.append((worker, dataset.make_initializable_iterator())) + return MultiWorkerDataIterator(iterators, self._worker_device_pairs) class MultiStepContext(object): @@ -1430,13 +1339,13 @@ class MultiStepContext(object): output: The tensors that should be outputted with `name`. See below for actual types supported. aggregation: Aggregation method to use to aggregate outputs from multiple - replicas. Required if `set_last_step_output` is called in a replica context. - Optional in cross_replica_context. + replicas. Required if `set_last_step_output` is called in a replica + context. Optional in cross_replica_context. When present, the outputs from all the replicas are aggregated using the current distribution strategy's `reduce` method. Hence, the type of `output` must be what's supported by the corresponding `reduce` method. For e.g. if using MirroredStrategy and aggregation is set, output - must be a `PerDevice` value. + must be a `PerReplica` value. The aggregation method is also recorded in a dictionary `_last_step_outputs_aggregations` for later interpreting of the outputs as already reduced or not. @@ -1482,7 +1391,7 @@ class MultiStepContext(object): def value_container(val): - """Returns the container that this per-device `value` belongs to. + """Returns the container that this per-replica `value` belongs to. Args: val: A value returned by `call_for_each_replica()` or a variable @@ -1528,8 +1437,8 @@ class AggregatingVariable(checkpointable.CheckpointableBase): # We are calling an assign function in an update context. return f(self._v, *args, **kwargs) - # We are calling an assign function in cross replica context, wrap it in an - # update call. + # We are calling an assign function in cross replica context, wrap it in + # an update call. 
return distribution_strategy_context.get_distribution_strategy().update( self, f, *args, **kwargs) else: diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py index d514e6f4c15..268393ee801 100644 --- a/tensorflow/contrib/distribute/python/values_test.py +++ b/tensorflow/contrib/distribute/python/values_test.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import os from tensorflow.contrib.distribute.python import mirrored_strategy @@ -190,10 +189,10 @@ def _make_mirrored(): class RegroupAndSelectDeviceTest(test.TestCase): - def _is_per_device(self, result, expected, klass=values.PerDevice): + def _is_per_replica(self, result, expected, klass=values.PerReplica): self.assertIsInstance(result, klass) # We canonicalize the devices to match the device strings returned - # by PerDevice, which also does device string canonicalization. + # by PerReplica, which also does device string canonicalization. devices = [device_util.canonicalize(_device_str(i)) for i in range(len(expected))] self.assertEqual(set(devices), set(result.devices)) @@ -206,18 +205,18 @@ class RegroupAndSelectDeviceTest(test.TestCase): _device_str(1): _nested_value("2")}) self.assertIsInstance(result, tuple) self.assertEqual(3, len(result)) - self._is_per_device(result[0], ["a1", "a2"]) - self._is_per_device(result[2], ["h1", "h2"]) + self._is_per_replica(result[0], ["a1", "a2"]) + self._is_per_replica(result[2], ["h1", "h2"]) self.assertIsInstance(result[1], list) self.assertEqual(3, len(result[1])) - self._is_per_device(result[1][0], ["b1", "b2"]) - self._is_per_device(result[1][2], ["g1", "g2"]) + self._is_per_replica(result[1][0], ["b1", "b2"]) + self._is_per_replica(result[1][2], ["g1", "g2"]) self.assertIsInstance(result[1][1], dict) self.assertEqual(set(["c", "e"]), set(result[1][1].keys())) - self._is_per_device(result[1][1]["c"], ["d1", "d2"]) - self._is_per_device(result[1][1]["e"], ["f1", "f2"]) + self._is_per_replica(result[1][1]["c"], ["d1", "d2"]) + self._is_per_replica(result[1][1]["e"], ["f1", "f2"]) # Also test that we can undo the merge using select_device() self.assertEqual(_nested_value("1"), @@ -238,18 +237,18 @@ class RegroupAndSelectDeviceTest(test.TestCase): values.Mirrored) self.assertIsInstance(result, tuple) self.assertEqual(3, len(result)) - self._is_per_device(result[0], ["a1", "a2"], values.Mirrored) - self._is_per_device(result[2], ["h1", "h2"], values.Mirrored) + self._is_per_replica(result[0], ["a1", "a2"], values.Mirrored) + self._is_per_replica(result[2], ["h1", "h2"], values.Mirrored) self.assertIsInstance(result[1], list) self.assertEqual(3, len(result[1])) - self._is_per_device(result[1][0], ["b1", "b2"], values.Mirrored) - self._is_per_device(result[1][2], ["g1", "g2"], values.Mirrored) + self._is_per_replica(result[1][0], ["b1", "b2"], values.Mirrored) + self._is_per_replica(result[1][2], ["g1", "g2"], values.Mirrored) self.assertIsInstance(result[1][1], dict) self.assertEqual(set(["c", "e"]), set(result[1][1].keys())) - self._is_per_device(result[1][1]["c"], ["d1", "d2"], values.Mirrored) - self._is_per_device(result[1][1]["e"], ["f1", "f2"], values.Mirrored) + self._is_per_replica(result[1][1]["c"], ["d1", "d2"], values.Mirrored) + self._is_per_replica(result[1][1]["e"], ["f1", "f2"], values.Mirrored) # Also test that we can undo the merge using select_device() self.assertEqual(_nested_value("1"), @@ -275,7 +274,7 @@ class 
RegroupAndSelectDeviceTest(test.TestCase): _device_str(1): ("b", foo)}) self.assertIsInstance(result, tuple) self.assertEqual(2, len(result)) - self._is_per_device(result[0], ["a", "b"]) + self._is_per_replica(result[0], ["a", "b"]) self.assertIs(foo, result[1]) # Test select_device(), should undo the merge done by regroup(). @@ -341,53 +340,30 @@ class RegroupAndSelectDeviceTest(test.TestCase): merged_estimator_spec)) -class PerDeviceDatasetTest(test.TestCase): +class PerReplicaDatasetTest(test.TestCase): config = config_pb2.ConfigProto() config.allow_soft_placement = True - def _test_iterator_no_prefetch(self, devices, dataset, expected_values): - per_device_dataset = values.PerDeviceDataset( - dataset, devices, prefetch_on_device=False) + def _test_iterator(self, devices, dataset, expected_values): + per_replica_dataset = values.PerReplicaDataset(dataset, devices) if context.executing_eagerly(): - iterator = per_device_dataset.make_one_shot_iterator() + iterator = per_replica_dataset.make_one_shot_iterator() else: - iterator = per_device_dataset.make_initializable_iterator() + iterator = per_replica_dataset.make_initializable_iterator() self.evaluate([iterator.initializer]) for expected_value in expected_values: next_element = iterator.get_next() - actual = self.evaluate([ - values.select_device(d, next_element) for d in devices]) - self.assertEqual(expected_value, actual) + computed_value = self.evaluate( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) with self.assertRaises(errors.OutOfRangeError): next_element = iterator.get_next() self.evaluate([ values.select_device(d, next_element) for d in devices]) - def _test_iterator_with_prefetch(self, devices, dataset, expected_values): - if not context.executing_eagerly(): - per_device_dataset = values.PerDeviceDataset( - dataset, devices, prefetch_on_device=True) - iterator = per_device_dataset.make_initializable_iterator() - self.evaluate([iterator.initializer]) - - for expected_value in expected_values: - next_element = iterator.get_next() - computed_value = self.evaluate( - [values.select_device(d, next_element) for d in devices]) - self.assertEqual(expected_value, computed_value) - - with self.assertRaises(errors.OutOfRangeError): - next_element = iterator.get_next() - self.evaluate([ - values.select_device(d, next_element) for d in devices]) - - def _test_iterator(self, devices, dataset, expected_values): - self._test_iterator_no_prefetch(devices, dataset, expected_values) - self._test_iterator_with_prefetch(devices, dataset, expected_values) - @test_util.run_in_graph_and_eager_modes def testOneDevice(self): devices = ["/device:CPU:0"] @@ -442,9 +418,8 @@ class PerDeviceDatasetTest(test.TestCase): dataset = dataset_ops.Dataset.from_tensor_slices( random_ops.random_uniform((10,))) - per_device_dataset = values.PerDeviceDataset( - dataset, devices, prefetch_on_device=False) - iterator = per_device_dataset.make_initializable_iterator() + per_replica_dataset = values.PerReplicaDataset(dataset, devices) + iterator = per_replica_dataset.make_initializable_iterator() self.evaluate(iterator.initializer) next_element = iterator.get_next() @@ -463,7 +438,7 @@ class PerDeviceDatasetTest(test.TestCase): class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase): - def _test_iterator(self, iterator, devices, expected_values): + def _test_iterator(self, sess, iterator, devices, expected_values): next_element = iterator.get_next() for device in devices: v = 
values.select_device(device, next_element) @@ -472,73 +447,79 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase): self.assertTrue(element.device in device) for expected_value in expected_values: - actual = self.evaluate( + actual = sess.run( [values.select_device(d, next_element) for d in devices]) self.assertEqual(expected_value, actual) with self.assertRaises(errors.OutOfRangeError): - self.evaluate([values.select_device(d, next_element) for d in devices]) + sess.run([values.select_device(d, next_element) for d in devices]) - def _test_dataset(self, dataset_fn, worker_device_map, devices, - expected_values): + def _test_dataset(self, dataset_fn, worker_devices, devices, + expected_values, auto_shard=True): multi_worker_dataset = values.MultiWorkerDataset( - dataset_fn, worker_device_map, prefetch_on_device=False) - multi_worker_iterator = multi_worker_dataset.make_one_shot_iterator() - self._test_iterator(multi_worker_iterator, devices, expected_values) + dataset_fn, worker_devices, auto_shard=auto_shard) + multi_worker_iterator = multi_worker_dataset.make_initializable_iterator() + with self.cached_session() as sess: + sess.run(multi_worker_iterator.initializer) + self._test_iterator(sess, multi_worker_iterator, devices, expected_values) def _cpu_devices(self): - worker_device_map = collections.OrderedDict( - [("/job:worker/replica:0/task:0", - ["/job:worker/replica:0/task:0/device:CPU:0"]), - ("/job:worker/replica:0/task:1", - ["/job:worker/replica:0/task:1/device:CPU:0"])]) + worker_devices = [ + ("/job:worker/replica:0/task:0", + ["/job:worker/replica:0/task:0/device:CPU:0"]), + ("/job:worker/replica:0/task:1", + ["/job:worker/replica:0/task:1/device:CPU:0"])] devices = [ "/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:1/device:CPU:0" ] - return worker_device_map, devices + return worker_devices, devices def _cpu_and_one_gpu_devices(self): - # The worker_device_map doesn't have to be a OrderDict object, this is just - # to simplify the testing so that we can pass expected values as a list - # instead of a dict. 
- worker_device_map = collections.OrderedDict( - [("/job:worker/replica:0/task:0", [ + worker_devices = [ + ("/job:worker/replica:0/task:0", [ "/job:worker/replica:0/task:0/device:GPU:0", "/job:worker/replica:0/task:0/device:CPU:0" - ]), ("/job:worker/replica:0/task:1", [ + ]), + ("/job:worker/replica:0/task:1", [ "/job:worker/replica:0/task:1/device:GPU:0", "/job:worker/replica:0/task:1/device:CPU:0" - ])]) + ]) + ] devices = [ "/job:worker/replica:0/task:0/device:GPU:0", "/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:1/device:GPU:0", "/job:worker/replica:0/task:1/device:CPU:0" ] - return worker_device_map, devices + return worker_devices, devices def testDataDistributionOneDevicePerWorker(self): - self.skipTest("Temporarily disabled.") - worker_device_map, devices = self._cpu_devices() + worker_devices, devices = self._cpu_devices() with context.graph_mode(): dataset_fn = lambda: dataset_ops.Dataset.range(8) - self._test_dataset(dataset_fn, worker_device_map, devices, + self._test_dataset(dataset_fn, worker_devices, devices, [[0, 1], [2, 3], [4, 5], [6, 7]]) + def testDataDistributionNoAutoShard(self): + worker_devices, devices = self._cpu_devices() + with context.graph_mode(): + dataset_fn = lambda: dataset_ops.Dataset.range(4) + self._test_dataset(dataset_fn, worker_devices, devices, + [[0, 0], [1, 1], [2, 2], [3, 3]], + auto_shard=False) + def testDataDistributionTwoDevicePerWorker(self): - self.skipTest("Temporarily disabled.") if context.num_gpus() < 1: self.skipTest("A GPU is not available for this test.") - worker_device_map, devices = self._cpu_and_one_gpu_devices() + worker_devices, devices = self._cpu_and_one_gpu_devices() with context.graph_mode(): dataset_fn = lambda: dataset_ops.Dataset.range(8) - self._test_dataset(dataset_fn, worker_device_map, devices, + self._test_dataset(dataset_fn, worker_devices, devices, [[0, 2, 1, 3], [4, 6, 5, 7]]) def testTupleDataset(self): - self.skipTest("Temporarily disabled.") - worker_device_map, devices = self._cpu_devices() + worker_devices, devices = self._cpu_devices() with context.graph_mode(): @@ -550,41 +531,38 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase): expected_values = [ [(i, i**2), (i + 1, (i + 1)**2)] for i in range(0, 8, 2) ] - self._test_dataset(dataset_fn, worker_device_map, devices, + self._test_dataset(dataset_fn, worker_devices, devices, expected_values) def testInitializableIterator(self): - self.skipTest("Temporarily disabled.") - worker_device_map, devices = self._cpu_devices() - with context.graph_mode(): + worker_devices, devices = self._cpu_devices() + with context.graph_mode(), self.cached_session() as sess: dataset_fn = lambda: dataset_ops.Dataset.range(8) multi_worker_dataset = values.MultiWorkerDataset( - dataset_fn, worker_device_map, prefetch_on_device=False) + dataset_fn, worker_devices, auto_shard=True) multi_worker_iterator = multi_worker_dataset.make_initializable_iterator() - self.evaluate(multi_worker_iterator.initializer) - self._test_iterator(multi_worker_iterator, devices, + sess.run(multi_worker_iterator.initializer) + self._test_iterator(sess, multi_worker_iterator, devices, [[0, 1], [2, 3], [4, 5], [6, 7]]) # After re-initializing the iterator, should be able to iterate again. 
- self.evaluate(multi_worker_iterator.initializer) - self._test_iterator(multi_worker_iterator, devices, + sess.run(multi_worker_iterator.initializer) + self._test_iterator(sess, multi_worker_iterator, devices, [[0, 1], [2, 3], [4, 5], [6, 7]]) def testValueErrorForIterator(self): - self.skipTest("Temporarily disabled.") # Incompatiable arguments. with self.assertRaises(ValueError): values.MultiWorkerDataIterator({"w1": None}, {"w1": "d1", "w2": "d2"}) # Test duplicated devices under same worker. - worker_device_map, _ = self._cpu_devices() - worker_device_map["/job:worker/replica:0/task:0"].append( - "/job:worker/replica:0/task:0/device:CPU:0") + worker_devices, _ = self._cpu_devices() + worker_devices[0][1].append("/job:worker/replica:0/task:0/device:CPU:0") with context.graph_mode(): dataset_fn = lambda: dataset_ops.Dataset.range(8) multi_worker_dataset = values.MultiWorkerDataset( - dataset_fn, worker_device_map, prefetch_on_device=False) + dataset_fn, worker_devices, auto_shard=True) multi_worker_iterator = multi_worker_dataset.make_initializable_iterator() with self.assertRaises(ValueError): multi_worker_iterator.get_next() diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py index 6e775afb69a..67ffb939663 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py @@ -544,4 +544,19 @@ class SequenceNumericColumn( return fc.SequenceDenseColumn.TensorSequenceLengthPair( dense_tensor=dense_tensor, sequence_length=seq_length) + # TODO(b/119409767): Implement parents, _{get,from}_config. + @property + def parents(self): + """See 'FeatureColumn` base class.""" + raise NotImplementedError() + + def _get_config(self): + """See 'FeatureColumn` base class.""" + raise NotImplementedError() + + @classmethod + def _from_config(cls, config, custom_objects=None, columns_by_name=None): + """See 'FeatureColumn` base class.""" + raise NotImplementedError() + # pylint: enable=protected-access diff --git a/tensorflow/contrib/framework/python/framework/experimental_test.py b/tensorflow/contrib/framework/python/framework/experimental_test.py index cfdc7df7d8f..00e04b83ac4 100644 --- a/tensorflow/contrib/framework/python/framework/experimental_test.py +++ b/tensorflow/contrib/framework/python/framework/experimental_test.py @@ -44,17 +44,18 @@ class ExperimentalTest(test.TestCase): # Assert function docs are properly updated. self.assertEqual("_fn", _fn.__name__) - self.assertEqual("fn doc. (experimental)" - "\n" - "\nTHIS FUNCTION IS EXPERIMENTAL. It may change or " - "be removed at any time, and without warning." - "\n" - "\nArgs:" - "\n arg0: Arg 0." - "\n arg1: Arg 1." - "\n" - "\nReturns:" - "\n Sum of args.", _fn.__doc__) + self.assertEqual( + "fn doc. (experimental)" + "\n" + "\nWarning: THIS FUNCTION IS EXPERIMENTAL. It may change " + "or be removed at any time, and without warning." + "\n" + "\nArgs:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." + "\n" + "\nReturns:" + "\n Sum of args.", _fn.__doc__) # Assert calling new fn issues log warning. 
self.assertEqual(3, _fn(1, 2)) diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc index d8584e4e6b7..b3f48ec1dd9 100644 --- a/tensorflow/contrib/gdr/gdr_server_lib.cc +++ b/tensorflow/contrib/gdr/gdr_server_lib.cc @@ -52,9 +52,10 @@ Status GdrServer::Init() { [this](const WorkerEnv* env) { return new GdrRendezvousMgr(env, remote_memory_manager_.get()); }; - WorkerCreationFunction worker_func = [this](WorkerEnv* env) { + WorkerCreationFunction worker_func = [this](WorkerEnv* env, + const ConfigProto& config) { return std::unique_ptr( - new GdrWorker(env, remote_memory_manager_.get())); + new GdrWorker(env, config, remote_memory_manager_.get())); }; TF_RETURN_IF_ERROR(remote_memory_manager_->Init()); diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc index ce1d8d2d730..867cb83f420 100644 --- a/tensorflow/contrib/gdr/gdr_worker.cc +++ b/tensorflow/contrib/gdr/gdr_worker.cc @@ -39,9 +39,9 @@ limitations under the License. namespace tensorflow { -GdrWorker::GdrWorker(WorkerEnv* worker_env, +GdrWorker::GdrWorker(WorkerEnv* worker_env, const ConfigProto& config, RemoteMemoryManager* remote_memory_manager) - : GrpcWorker(worker_env), + : GrpcWorker(worker_env, config), remote_memory_manager_(remote_memory_manager), recv_tensor_recent_request_ids_(100000) {} diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h index 65105ed9973..39f11e6bde5 100644 --- a/tensorflow/contrib/gdr/gdr_worker.h +++ b/tensorflow/contrib/gdr/gdr_worker.h @@ -25,7 +25,8 @@ namespace tensorflow { class GdrWorker : public GrpcWorker { public: - GdrWorker(WorkerEnv* env, RemoteMemoryManager* remote_memory_manager); + GdrWorker(WorkerEnv* env, const ConfigProto& config, + RemoteMemoryManager* remote_memory_manager); // Serve the RecvTensorRequest but omit the tensor content and transmit it // out-of-band using GPU Direct RDMA whenever possible. 
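The `values.py` changes above replace the `worker_device_map` dict with an ordered list of `(worker, devices)` pairs. A minimal Python sketch of the new calling convention for `MultiWorkerDataset`, using the two single-CPU workers from `values_test.py` (the import paths and dataset function are illustrative assumptions, not part of this patch):

```python
# Sketch: building worker_device_pairs for the updated MultiWorkerDataset.
# Assumes two workers with one CPU each, matching _cpu_devices() in values_test.py.
from tensorflow.contrib.distribute.python import values
from tensorflow.python.data.ops import dataset_ops

worker_device_pairs = [
    ("/job:worker/replica:0/task:0",
     ["/job:worker/replica:0/task:0/device:CPU:0"]),
    ("/job:worker/replica:0/task:1",
     ["/job:worker/replica:0/task:1/device:CPU:0"]),
]

dataset_fn = lambda: dataset_ops.Dataset.range(8)

# With auto_shard=True each worker receives its own shard of the 8 elements.
multi_worker_dataset = values.MultiWorkerDataset(
    dataset_fn, worker_device_pairs, auto_shard=True)
multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
```

As in `MultiWorkerDatasetTest`, the iterator's `initializer` is run inside a session before `get_next()` is called.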
diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD index 9393b702d11..2698b83a56a 100644 --- a/tensorflow/contrib/ignite/BUILD +++ b/tensorflow/contrib/ignite/BUILD @@ -22,48 +22,92 @@ py_library( srcs_version = "PY2AND3", deps = [ ":dataset_ops", + ":igfs_ops", ], ) tf_custom_op_library( - name = "_dataset_ops.so", - srcs = ["ops/dataset_ops.cc"], - deps = [":dataset_kernels"], + name = "_ignite_ops.so", + srcs = [ + "kernels/igfs/igfs.h", + "ops/dataset_ops.cc", + "ops/igfs_ops.cc", + ], + deps = [ + ":dataset_kernels", + ":igfs_kernels", + ], ) tf_gen_op_libs( op_lib_names = ["dataset_ops"], ) +tf_gen_op_libs( + op_lib_names = ["igfs_ops"], + deps = [":igfs_kernels"], +) + cc_library( - name = "dataset_kernels", + name = "ignite_client", srcs = [ - "kernels/ignite_dataset_ops.cc", - "kernels/ignite_client.h", - "kernels/ignite_byte_swapper.h", - "kernels/ignite_plain_client.h", - "kernels/ignite_ssl_wrapper.h", - "kernels/ignite_ssl_wrapper.cc", - "kernels/ignite_binary_object_parser.h", - "kernels/ignite_binary_object_parser.cc", - "kernels/ignite_dataset.h", - "kernels/ignite_dataset.cc", - "kernels/ignite_dataset_iterator.h", - "kernels/ignite_dataset_iterator.cc", + "kernels/client/ignite_client.h", + "kernels/client/ignite_byte_swapper.h", + "kernels/client/ignite_plain_client.h", + "kernels/client/ignite_ssl_wrapper.h", + "kernels/client/ignite_ssl_wrapper.cc", ] + if_not_windows([ - "kernels/ignite_plain_client_unix.cc", + "kernels/client/ignite_plain_client_unix.cc", ]) + if_windows([ - "kernels/ignite_plain_client_windows.cc", + "kernels/client/ignite_plain_client_windows.cc", ]), copts = if_windows([ "-DWIN32_LEAN_AND_MEAN", ]), deps = [ "//tensorflow/core:framework_headers_lib", - "//third_party/eigen3", "@boringssl//:ssl", "@protobuf_archive//:protobuf_headers", ], +) + +cc_library( + name = "dataset_kernels", + srcs = [ + "kernels/dataset/ignite_binary_object_parser.cc", + "kernels/dataset/ignite_binary_object_parser.h", + "kernels/dataset/ignite_dataset.cc", + "kernels/dataset/ignite_dataset.h", + "kernels/dataset/ignite_dataset_iterator.cc", + "kernels/dataset/ignite_dataset_iterator.h", + "kernels/dataset/ignite_dataset_ops.cc", + ], + deps = [ + ":ignite_client", + "//tensorflow/core:framework_headers_lib", + "//third_party/eigen3", + "@protobuf_archive//:protobuf_headers", + ], + alwayslink = 1, +) + +cc_library( + name = "igfs_kernels", + srcs = [ + "kernels/igfs/igfs.cc", + "kernels/igfs/igfs.h", + "kernels/igfs/igfs_client.cc", + "kernels/igfs/igfs_client.h", + "kernels/igfs/igfs_extended_tcp_client.cc", + "kernels/igfs/igfs_extended_tcp_client.h", + "kernels/igfs/igfs_messages.cc", + "kernels/igfs/igfs_messages.h", + "kernels/igfs/igfs_random_access_file.cc", + "kernels/igfs/igfs_random_access_file.h", + "kernels/igfs/igfs_writable_file.cc", + "kernels/igfs/igfs_writable_file.h", + ], + deps = [":ignite_client"], alwayslink = 1, ) @@ -82,10 +126,29 @@ py_library( ], ) +py_library( + name = "igfs_ops", + srcs = [ + "python/ops/igfs_ops.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":igfs_op_loader", + "//tensorflow/python:util", + "//tensorflow/python/data/util:nest", + ], +) + tf_gen_op_wrapper_py( name = "gen_dataset_ops", out = "python/ops/gen_dataset_ops.py", - deps = ["//tensorflow/contrib/ignite:dataset_ops_op_lib"], + deps = [":dataset_ops_op_lib"], +) + +tf_gen_op_wrapper_py( + name = "gen_igfs_ops", + out = "python/ops/gen_igfs_ops.py", + deps = [":igfs_ops_op_lib"], ) tf_kernel_library( @@ -97,13 +160,22 @@ 
tf_kernel_library( alwayslink = 1, ) +tf_kernel_library( + name = "igfs_ops_kernels", + deps = [ + ":igfs_kernels", + "//tensorflow/core:framework", + ], + alwayslink = 1, +) + tf_custom_op_py_library( name = "ignite_op_loader", srcs = ["python/ops/ignite_op_loader.py"], - dso = ["//tensorflow/contrib/ignite:_dataset_ops.so"], + dso = [":_ignite_ops.so"], kernels = [ ":dataset_ops_kernels", - "//tensorflow/contrib/ignite:dataset_ops_op_lib", + ":dataset_ops_op_lib", ], srcs_version = "PY2AND3", deps = [ @@ -113,6 +185,22 @@ tf_custom_op_py_library( ], ) +tf_custom_op_py_library( + name = "igfs_op_loader", + srcs = ["python/ops/igfs_op_loader.py"], + dso = [":_ignite_ops.so"], + kernels = [ + ":igfs_ops_kernels", + ":igfs_ops_op_lib", + ], + srcs_version = "PY2AND3", + deps = [ + ":gen_igfs_ops", + "//tensorflow/contrib/util:util_py", + "//tensorflow/python:platform", + ], +) + # The Apache Ignite servers have to setup before the test and tear down # after the test manually. The docker engine has to be installed. # @@ -122,8 +210,11 @@ tf_custom_op_py_library( # To tear down Apache Ignite servers: # $ bash ./python/tests/stop_ignite.sh tf_py_test( - name = "ignite_dataset_test", - srcs = ["python/tests/ignite_dataset_test.py"], + name = "ignite_test", + srcs = [ + "python/tests/igfs_test.py", + "python/tests/ignite_dataset_test.py", + ], additional_deps = [ ":ignite", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md index 55c89d27996..c7db0b77e25 100644 --- a/tensorflow/contrib/ignite/README.md +++ b/tensorflow/contrib/ignite/README.md @@ -1,19 +1,32 @@ -# Ignite Dataset +# Apache Ignite Integration -- [Overview](#overview) -- [Features](#features) - * [Distributed In-Memory Datasource](#distributed-in-memory-datasource) - * [Structured Objects](#structured-objects) - * [Distributed Training](#distributed-training) - * [SSL Connection](#ssl-connection) - * [Windows Support](#windows-support) -- [Try it out](#try-it-out) -- [Limitations](#limitations) +- [Overview](#overview) +- [Features](#features) + * [Distributed In-Memory Datasource](#distributed-in-memory-datasource) + * [Structured Objects](#structured-objects) + * [Distributed Training](#distributed-training) + * [Distributed File System](#distributed-file-system) + * [SSL Connection](#ssl-connection) + * [Windows Support](#windows-support) +- [Try it out](#try-it-out) + * [Ignite Dataset](#ignite-dataset) + * [IGFS](#igfs) +- [Limitations](#limitations) ## Overview -[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed database, caching, and processing platform for -transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. This contrib package contains an integration between Apache Ignite and TensorFlow. The integration is based on [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow side and [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) from Apache Ignite side. It allows to use Apache Ignite as a data source for neural network training, inference and all other computations supported by TensorFlow. +[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed +database, caching, and processing platform for transactional, analytical, and +streaming workloads, delivering in-memory speeds at petabyte scale. This contrib +package contains an integration between Apache Ignite and TensorFlow. 
The + integration is based on + [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow + side and + [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) + from Apache Ignite side. It allows using Apache Ignite as a data source for + neural network training, inference and all other computations supported by + TensorFlow. Another part of this module is an integration with a distributed file + system based on Apache Ignite. ## Features @@ -134,6 +147,23 @@ Ignite Dataset allows using these two aspects of distributed neural network trai High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. +### Distributed File System + +In addition to database functionality, Apache Ignite provides a distributed file +system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS +delivers functionality similar to Hadoop HDFS, but entirely in-memory. In fact, in +addition to its own APIs, IGFS implements the Hadoop FileSystem API and can be +transparently plugged into Hadoop or Spark deployments. This contrib package +contains an integration between IGFS and TensorFlow. The integration is based +on the [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys) +from TensorFlow side and +[IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache +Ignite side. It has numerous uses, for example: checkpoints of state can be +saved to IGFS for reliability and fault-tolerance, and training processes can +communicate with TensorBoard by writing event files to a directory, which +TensorBoard watches. IGFS allows this communication to work even when +TensorBoard runs in a different process or machine. + ### SSL Connection Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information, please refer to the [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation. @@ -141,9 +171,12 @@ Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikip ```python >>> import tensorflow as tf >>> from tensorflow.contrib.ignite import IgniteDataset ->>> ->>> dataset = IgniteDataset(cache_name="IMAGES", certfile="client.pem", cert_password="password", username="ignite", password="ignite") ->>> ... +>>> +>>> dataset = IgniteDataset(cache_name="IMAGES", + certfile="client.pem", + cert_password="password", + username="ignite", + password="ignite") ``` ### Windows Support @@ -152,7 +185,16 @@ Ignite Dataset is fully compatible with Windows. You can use it as part of Tenso ## Try it out -The simplest way to try Ignite Dataset is to run a [Docker](https://www.docker.com/) container with Apache Ignite and loaded [MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with it using Ignite Dataset. Such container is available on Docker Hub: [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/). You need to start this container on your machine: +The following examples will help you easily start working with this module. + +### Ignite Dataset + +The simplest way to try Ignite Dataset is to run a +[Docker](https://www.docker.com/) container with Apache Ignite and loaded +[MNIST](http://yann.lecun.com/exdb/mnist/) data and then interact with +it using Ignite Dataset.
Such a container is available on Docker Hub: +[dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/). +You need to start this container on your machine: ``` docker run -it -p 10800:10800 dmitrievanthony/ignite-with-mnist @@ -162,6 +204,35 @@ After that you will be able to work with it following way: ![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist.png "Ignite Dataset Mnist") +### IGFS + +The simplest way to try IGFS with TensorFlow is to run a +[Docker](https://www.docker.com/) container with Apache Ignite and IGFS enabled +and then interact with it using TensorFlow +[tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such a container +is available on Docker Hub: +[dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/). +You need to start this container on your machine: + +``` +docker run -it -p 10500:10500 dmitrievanthony/ignite-with-igfs +``` + +After that you will be able to work with it in the following way: + +```python +>>> import tensorflow as tf +>>> import tensorflow.contrib.ignite.python.ops.igfs_ops +>>> +>>> with tf.gfile.Open("igfs:///hello.txt", mode='w') as w: +>>> w.write("Hello, world!") +>>> +>>> with tf.gfile.Open("igfs:///hello.txt", mode='r') as r: +>>> print(r.read()) + +Hello, world! +``` + ## Limitations Presently, Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of an object structure. diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h similarity index 67% rename from tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h rename to tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h index 46df3e39dc4..aac950fcc2a 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h +++ b/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License.
==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_ -#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_ +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_ #include #include "tensorflow/core/platform/byte_order.h" @@ -25,76 +25,75 @@ class ByteSwapper { public: ByteSwapper(bool big_endian) { swap_ = big_endian == port::kLittleEndian; } - inline void SwapIfRequiredInt16(int16_t *x) const { + void SwapIfRequiredInt16(int16_t *x) const { if (swap_) { Swap16(x); } } - inline void SwapIfRequiredUnsignedInt16(uint16_t *x) const { + void SwapIfRequiredUnsignedInt16(uint16_t *x) const { if (swap_) { Swap16(reinterpret_cast(x)); } } - inline void SwapIfRequiredInt32(int32_t *x) const { + void SwapIfRequiredInt32(int32_t *x) const { if (swap_) { Swap32(x); } } - inline void SwapIfRequiredFloat(float *x) const { + void SwapIfRequiredFloat(float *x) const { if (swap_) { Swap32(reinterpret_cast(x)); } } - inline void SwapIfRequiredInt64(int64_t *x) const { + void SwapIfRequiredInt64(int64_t *x) const { if (swap_) { Swap64(x); } } - inline void SwapIfRequiredDouble(double *x) const { + void SwapIfRequiredDouble(double *x) const { if (swap_) { Swap64(reinterpret_cast(x)); } } - inline void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const { + void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const { if (swap_) { for (int32_t i = 0; i < length; i++) Swap16(&x[i]); } } - inline void SwapIfRequiredUnsignedInt16Arr(uint16_t *x, - int32_t length) const { + void SwapIfRequiredUnsignedInt16Arr(uint16_t *x, int32_t length) const { if (swap_) { for (int32_t i = 0; i < length; i++) Swap16(reinterpret_cast(&x[i])); } } - inline void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const { + void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const { if (swap_) { for (int32_t i = 0; i < length; i++) Swap32(&x[i]); } } - inline void SwapIfRequiredFloatArr(float *x, int32_t length) const { + void SwapIfRequiredFloatArr(float *x, int32_t length) const { if (swap_) { for (int32_t i = 0; i < length; i++) Swap32(reinterpret_cast(&x[i])); } } - inline void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const { + void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const { if (swap_) { for (int32_t i = 0; i < length; i++) Swap64(&x[i]); } } - inline void SwapIfRequiredDoubleArr(double *x, int32_t length) const { + void SwapIfRequiredDoubleArr(double *x, int32_t length) const { if (swap_) { for (int32_t i = 0; i < length; i++) Swap64(reinterpret_cast(&x[i])); @@ -102,16 +101,16 @@ class ByteSwapper { } private: - inline void Swap16(int16_t *x) const { + void Swap16(int16_t *x) const { *x = ((*x & 0xFF) << 8) | ((*x >> 8) & 0xFF); } - inline void Swap32(int32_t *x) const { + void Swap32(int32_t *x) const { *x = ((*x & 0xFF) << 24) | (((*x >> 8) & 0xFF) << 16) | (((*x >> 16) & 0xFF) << 8) | ((*x >> 24) & 0xFF); } - inline void Swap64(int64_t *x) const { + void Swap64(int64_t *x) const { *x = ((*x & 0xFF) << 56) | (((*x >> 8) & 0xFF) << 48) | (((*x >> 16) & 0xFF) << 40) | (((*x >> 24) & 0xFF) << 32) | (((*x >> 32) & 0xFF) << 24) | (((*x >> 40) & 0xFF) << 16) | @@ -123,4 +122,4 @@ class ByteSwapper { } // namespace tensorflow -#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_ +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_ diff --git 
a/tensorflow/contrib/ignite/kernels/ignite_client.h b/tensorflow/contrib/ignite/kernels/client/ignite_client.h similarity index 74% rename from tensorflow/contrib/ignite/kernels/ignite_client.h rename to tensorflow/contrib/ignite/kernels/client/ignite_client.h index 459b50b48fd..0da80769260 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_client.h +++ b/tensorflow/contrib/ignite/kernels/client/ignite_client.h @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_ -#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_ +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_ -#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -32,44 +32,44 @@ class Client { virtual Status ReadData(uint8_t *buf, const int32_t length) = 0; virtual Status WriteData(const uint8_t *buf, const int32_t length) = 0; - inline Status ReadByte(uint8_t *data) { return ReadData(data, 1); } + Status ReadByte(uint8_t *data) { return ReadData(data, 1); } - inline Status ReadShort(int16_t *data) { + Status ReadShort(int16_t *data) { TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 2)); byte_swapper_.SwapIfRequiredInt16(data); return Status::OK(); } - inline Status ReadInt(int32_t *data) { + Status ReadInt(int32_t *data) { TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 4)); byte_swapper_.SwapIfRequiredInt32(data); return Status::OK(); } - inline Status ReadLong(int64_t *data) { + Status ReadLong(int64_t *data) { TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 8)); byte_swapper_.SwapIfRequiredInt64(data); return Status::OK(); } - inline Status WriteByte(const uint8_t data) { return WriteData(&data, 1); } + Status WriteByte(const uint8_t data) { return WriteData(&data, 1); } - inline Status WriteShort(const int16_t data) { + Status WriteShort(const int16_t data) { int16_t tmp = data; byte_swapper_.SwapIfRequiredInt16(&tmp); return WriteData((uint8_t *)&tmp, 2); } - inline Status WriteInt(const int32_t data) { + Status WriteInt(const int32_t data) { int32_t tmp = data; byte_swapper_.SwapIfRequiredInt32(&tmp); return WriteData((uint8_t *)&tmp, 4); } - inline Status WriteLong(const int64_t data) { + Status WriteLong(const int64_t data) { int64_t tmp = data; byte_swapper_.SwapIfRequiredInt64(&tmp); return WriteData((uint8_t *)&tmp, 8); @@ -81,4 +81,4 @@ class Client { } // namespace tensorflow -#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_ +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_ diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h similarity index 80% rename from tensorflow/contrib/ignite/kernels/ignite_plain_client.h rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h index 75424c19ee4..54658324604 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h +++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_ -#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_ +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_ -#include "tensorflow/contrib/ignite/kernels/ignite_client.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h" namespace tensorflow { @@ -40,4 +40,4 @@ class PlainClient : public Client { } // namespace tensorflow -#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_ +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_ diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc similarity index 97% rename from tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc index cf672942c61..54efb5b6176 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc +++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h" #include #include diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc similarity index 98% rename from tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc index dad5aace5fa..a99a3ada558 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc +++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h" #define WIN32_LEAN_AND_MEAN #include diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc similarity index 98% rename from tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc rename to tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc index ceb479b0846..8f09c24a3be 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc +++ b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h" #include #include diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h similarity index 82% rename from tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h rename to tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h index 0406644bbaa..543e03d1efc 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h +++ b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_ -#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_ +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_ -#include "tensorflow/contrib/ignite/kernels/ignite_client.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h" #include @@ -48,4 +48,4 @@ class SslWrapper : public Client { } // namespace tensorflow -#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_ +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_ diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc similarity index 99% rename from tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc rename to tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc index 2c8a7d44b07..4218ec05f2c 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h" +#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h similarity index 87% rename from tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h rename to tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h index eb1f856643a..3e8a1a19623 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_ -#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_ +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_ #include -#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/status.h" @@ -78,4 +78,4 @@ enum ObjectType { } // namespace tensorflow -#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_ +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_ diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc similarity index 97% rename from tensorflow/contrib/ignite/kernels/ignite_dataset.cc rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc index c4a7d3c513a..ace96e7b09f 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h" +#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h" #include "tensorflow/core/platform/logging.h" namespace tensorflow { diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h similarity index 91% rename from tensorflow/contrib/ignite/kernels/ignite_dataset.h rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h index 66bfdf2e2a1..db3bafb11f2 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_dataset.h +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_ -#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_ +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_ #include "tensorflow/core/framework/dataset.h" @@ -60,4 +60,4 @@ class IgniteDataset : public DatasetBase { } // namespace tensorflow -#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_ +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_ diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc similarity index 98% rename from tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc index 5da9127aa6a..ce8972f1e7f 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h" +#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h" -#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h" -#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h similarity index 87% rename from tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h index c499e2c9ccf..5868c2cb67f 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_ -#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_ +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_ -#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h" -#include "tensorflow/contrib/ignite/kernels/ignite_client.h" -#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h" +#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h" +#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h" +#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h" #include "tensorflow/core/platform/mutex.h" namespace tensorflow { @@ -96,4 +96,4 @@ constexpr int32_t kMinResLength = 12; } // namespace tensorflow -#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_ +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_ diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc similarity index 97% rename from tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc index f75b1c5ff55..f2108775e29 100644 --- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc +++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc @@ -15,8 +15,8 @@ limitations under the License. #include -#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h" -#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h" +#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h" +#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h" #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/lib/strings/numbers.h" diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs.cc new file mode 100644 index 00000000000..ae2dbcc2cf5 --- /dev/null +++ b/tensorflow/contrib/ignite/kernels/igfs/igfs.cc @@ -0,0 +1,331 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h"
+
+namespace tensorflow {
+
+static string GetEnvOrElse(const string &env, string default_value) {
+  const char *env_c_str = env.c_str();
+  return getenv(env_c_str) != nullptr ? getenv(env_c_str) : default_value;
+}
+
+static string MakeRelative(const string &a, const string &b) {
+  string max = a;
+  string min = b;
+  bool first = b.size() > a.size();
+
+  if (first) {
+    max = b;
+    min = a;
+  }
+
+  auto r = mismatch(min.begin(), min.end(), max.begin());
+  return string((first ? r.first : r.second), first ? min.end() : max.end());
+}
+
+string IGFS::TranslateName(const string &name) const {
+  StringPiece scheme, namenode, path;
+  io::ParseURI(name, &scheme, &namenode, &path);
+  return string(path.data(), path.length());
+}
+
+IGFS::IGFS()
+    : host_(GetEnvOrElse("IGFS_HOST", "localhost")),
+      port_([] {
+        int port;
+        if (strings::safe_strto32(GetEnvOrElse("IGFS_PORT", "10500").c_str(),
+                                  &port)) {
+          return port;
+        } else {
+          LOG(WARNING)
+              << "IGFS_PORT environment variable had an invalid value: "
+              << getenv("IGFS_PORT") << "\nUsing default port 10500.";
+          return 10500;
+        }
+      }()),
+      fs_name_(GetEnvOrElse("IGFS_FS_NAME", "default_fs")) {
+  LOG(INFO) << "IGFS created [host=" << host_ << ", port=" << port_
+            << ", fs_name=" << fs_name_ << "]";
+}
+
+IGFS::~IGFS() {
+  LOG(INFO) << "IGFS destroyed [host=" << host_ << ", port=" << port_
+            << ", fs_name=" << fs_name_ << "]";
+}
+
+Status IGFS::NewRandomAccessFile(const string &file_name,
+                                 std::unique_ptr<RandomAccessFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<OpenReadResponse> open_read_response(true);
+  TF_RETURN_IF_ERROR(client->OpenRead(&open_read_response, path));
+
+  int64 resource_id = open_read_response.res.stream_id;
+  result->reset(new IGFSRandomAccessFile(path, resource_id, std::move(client)));
+
+  LOG(INFO) << "New random access file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewWritableFile(const string &file_name,
+                             std::unique_ptr<WritableFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, path));
+
+  if (exists_response.res.exists) {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, path, false));
+  }
+
+  CtrlResponse<OpenCreateResponse> open_create_resp(false);
+  TF_RETURN_IF_ERROR(client->OpenCreate(&open_create_resp, path));
+
+  int64 resource_id = open_create_resp.res.stream_id;
+  result->reset(new IGFSWritableFile(path, resource_id, std::move(client)));
+
+  LOG(INFO) << "New writable file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewAppendableFile(const string &file_name,
+                               std::unique_ptr<WritableFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, file_name));
+
+  if (exists_response.res.exists) {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, file_name, false));
+  }
+
+  CtrlResponse<OpenAppendResponse> open_append_resp(false);
+  TF_RETURN_IF_ERROR(client->OpenAppend(&open_append_resp, file_name));
+
+  result->reset(new IGFSWritableFile(TranslateName(file_name),
+                                     open_append_resp.res.stream_id,
+                                     std::move(client)));
+
+  LOG(INFO) << "New appendable file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewReadOnlyMemoryRegionFromFile(
+    const string &file_name, std::unique_ptr<ReadOnlyMemoryRegion> *result) {
+  return errors::Unimplemented("IGFS does not support ReadOnlyMemoryRegion");
+}
+
+Status IGFS::FileExists(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  const string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, path));
+
+  if (!exists_response.res.exists)
+    return errors::NotFound("File ", path, " not found");
+
+  LOG(INFO) << "File exists completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetChildren(const string &file_name,
+                         std::vector<string> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+  path = path + "/";
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ListPathsResponse> list_paths_response(false);
+  TF_RETURN_IF_ERROR(client->ListPaths(&list_paths_response, path));
+
+  *result = std::vector<string>();
+  std::vector<IGFSPath> entries = list_paths_response.res.entries;
+
+  for (IGFSPath &value : entries)
+    result->push_back(MakeRelative(value.path, path));
+
+  LOG(INFO) << "Get children completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetMatchingPaths(const string &pattern,
+                              std::vector<string> *results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
+Status IGFS::DeleteFile(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<DeleteResponse> del_response(false);
+  TF_RETURN_IF_ERROR(client->Delete(&del_response, path, false));
+
+  if (!del_response.res.exists)
+    return errors::NotFound("File ", path, " not found");
+
+  LOG(INFO) << "Delete file completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::CreateDir(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  const string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<MakeDirectoriesResponse> mkdir_response(false);
+  TF_RETURN_IF_ERROR(client->MkDir(&mkdir_response, path));
+
+  if (!mkdir_response.res.successful)
+    return errors::Unknown("Can't create directory ", path);
+
+  LOG(INFO) << "Create dir completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::DeleteDir(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ListFilesResponse> list_files_response(false);
+  TF_RETURN_IF_ERROR(client->ListFiles(&list_files_response, path));
+
+  if (!list_files_response.res.entries.empty()) {
+    return errors::FailedPrecondition("Can't delete a non-empty directory");
+  } else {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, path, true));
+  }
+
+  LOG(INFO) << "Delete dir completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetFileSize(const string &file_name, uint64 *size) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<InfoResponse> info_response(false);
+  TF_RETURN_IF_ERROR(client->Info(&info_response, path));
+
+  *size = info_response.res.file_info.length;
+
+  LOG(INFO) << "Get file size completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::RenameFile(const string &src, const string &dst) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string src_path = TranslateName(src);
+  string dst_path = TranslateName(dst);
+
+  if (FileExists(dst).ok()) DeleteFile(dst);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<RenameResponse> rename_response(false);
+  TF_RETURN_IF_ERROR(client->Rename(&rename_response, src_path, dst_path));
+
+  if (!rename_response.res.successful)
+    return errors::NotFound("File ", src_path, " not found");
+
+  LOG(INFO) << "Rename file completed successfully [src=" << src
+            << ", dst=" << dst << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::Stat(const string &file_name, FileStatistics *stats) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<InfoResponse> info_response(false);
+  TF_RETURN_IF_ERROR(client->Info(&info_response, path));
+
+  IGFSFile info = info_response.res.file_info;
+
+  *stats = FileStatistics(info.length, info.modification_time * 1000000,
+                          (info.flags & 0x1) != 0);
+
+  LOG(INFO) << "Stat completed successfully [file_name=" << file_name << "]";
+
+  return Status::OK();
+}
+
+std::unique_ptr<IGFSClient> IGFS::CreateClient() const {
+  return std::unique_ptr<IGFSClient>(
+      new IGFSClient(host_, port_, fs_name_, ""));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs.h b/tensorflow/contrib/ignite/kernels/igfs/igfs.h
new file mode 100644
index 00000000000..4c347e937f7
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs.h
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+class IGFS : public FileSystem {
+ public:
+  IGFS();
+  ~IGFS();
+  Status NewRandomAccessFile(
+      const string& file_name,
+      std::unique_ptr<RandomAccessFile>* result) override;
+  Status NewWritableFile(const string& fname,
+                         std::unique_ptr<WritableFile>* result) override;
+  Status NewAppendableFile(const string& fname,
+                           std::unique_ptr<WritableFile>* result) override;
+  Status NewReadOnlyMemoryRegionFromFile(
+      const string& fname,
+      std::unique_ptr<ReadOnlyMemoryRegion>* result) override;
+  Status FileExists(const string& fname) override;
+  Status GetChildren(const string& dir, std::vector<string>* result) override;
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+  Status DeleteFile(const string& fname) override;
+  Status CreateDir(const string& name) override;
+  Status DeleteDir(const string& name) override;
+  Status GetFileSize(const string& fname, uint64* size) override;
+  Status RenameFile(const string& src, const string& target) override;
+  Status Stat(const string& fname, FileStatistics* stat) override;
+  string TranslateName(const string& name) const override;
+
+ private:
+  std::unique_ptr<IGFSClient> CreateClient() const;
+
+  const string host_;
+  const int port_;
+  const string fs_name_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
new file mode 100644
index 00000000000..3f97c34fdd8
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+
+namespace tensorflow {
+
+IGFSClient::IGFSClient(const string &host, int port, const string &fs_name,
+                       const string &user_name)
+    : fs_name_(fs_name),
+      user_name_(user_name),
+      client_(ExtendedTCPClient(host, port, true)) {
+  client_.Connect();
+}
+
+IGFSClient::~IGFSClient() { client_.Disconnect(); }
+
+Status IGFSClient::SendRequestGetResponse(const Request &request,
+                                          Response *response) {
+  TF_RETURN_IF_ERROR(request.Write(&client_));
+  client_.reset();
+
+  if (response != nullptr) {
+    TF_RETURN_IF_ERROR(response->Read(&client_));
+    client_.reset();
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
new file mode 100644
index 00000000000..bbec7b00077
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+class IGFSClient {
+ public:
+  IGFSClient(const string &host, int port, const string &fs_name,
+             const string &user_name);
+  ~IGFSClient();
+
+  Status Handshake(CtrlResponse<HandshakeResponse> *res) {
+    return SendRequestGetResponse(HandshakeRequest(fs_name_, {}), res);
+  }
+
+  Status ListFiles(CtrlResponse<ListFilesResponse> *res, const string &path) {
+    return SendRequestGetResponse(ListFilesRequest(user_name_, path), res);
+  }
+
+  Status ListPaths(CtrlResponse<ListPathsResponse> *res, const string &path) {
+    return SendRequestGetResponse(ListPathsRequest(user_name_, path), res);
+  }
+
+  Status Info(CtrlResponse<InfoResponse> *res, const string &path) {
+    return SendRequestGetResponse(InfoRequest(user_name_, path), res);
+  }
+
+  Status OpenCreate(CtrlResponse<OpenCreateResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenCreateRequest(user_name_, path), res);
+  }
+
+  Status OpenAppend(CtrlResponse<OpenAppendResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenAppendRequest(user_name_, path), res);
+  }
+
+  Status OpenRead(CtrlResponse<OpenReadResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenReadRequest(user_name_, path), res);
+  }
+
+  Status Exists(CtrlResponse<ExistsResponse> *res, const string &path) {
+    return SendRequestGetResponse(ExistsRequest(user_name_, path), res);
+  }
+
+  Status MkDir(CtrlResponse<MakeDirectoriesResponse> *res, const string &path) {
+    return SendRequestGetResponse(MakeDirectoriesRequest(user_name_, path),
+                                  res);
+  }
+
+  Status Delete(CtrlResponse<DeleteResponse> *res, const string &path,
+                bool recursive) {
+    return SendRequestGetResponse(DeleteRequest(user_name_, path, recursive),
+                                  res);
+  }
+
+  Status WriteBlock(int64_t stream_id, const uint8_t *data, int32_t len) {
+    return SendRequestGetResponse(WriteBlockRequest(stream_id, data, len),
+                                  nullptr);
+  }
+
+  Status ReadBlock(ReadBlockCtrlResponse *res, int64_t stream_id, int64_t pos,
+                   int32_t length) {
+    return SendRequestGetResponse(ReadBlockRequest(stream_id, pos, length),
+                                  res);
+  }
+
+  Status Close(CtrlResponse<CloseResponse> *res, int64_t stream_id) {
+    return SendRequestGetResponse(CloseRequest(stream_id), res);
+  }
+
+  Status Rename(CtrlResponse<RenameResponse> *res, const string &source,
+                const string &dest) {
+    return SendRequestGetResponse(RenameRequest(user_name_, source, dest), res);
+  }
+
+ private:
+  Status SendRequestGetResponse(const Request &request, Response *response);
+
+  const string fs_name_;
+  const string user_name_;
+  ExtendedTCPClient client_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
new file mode 100644
index 00000000000..ea63436546d
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h"
+
+namespace tensorflow {
+
+ExtendedTCPClient::ExtendedTCPClient(const string &host, int port,
+                                     bool big_endian)
+    : PlainClient(host, port, big_endian), pos_(0) {}
+
+Status ExtendedTCPClient::ReadData(uint8_t *buf, const int32_t length) {
+  TF_RETURN_IF_ERROR(PlainClient::ReadData(buf, length));
+  pos_ += length;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteData(const uint8_t *buf, const int32_t length) {
+  TF_RETURN_IF_ERROR(PlainClient::WriteData(buf, length));
+  pos_ += length;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::Ignore(int n) {
+  uint8_t buf[n];
+  return ReadData(buf, n);
+}
+
+Status ExtendedTCPClient::SkipToPos(int target_pos) {
+  return Ignore(std::max(0, target_pos - pos_));
+}
+
+Status ExtendedTCPClient::ReadBool(bool *res) {
+  uint8_t buf = 0;
+  TF_RETURN_IF_ERROR(ReadData(&buf, 1));
+  *res = buf != 0;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::ReadNullableString(string *res) {
+  bool is_empty = false;
+  TF_RETURN_IF_ERROR(ReadBool(&is_empty));
+
+  if (!is_empty) {
+    TF_RETURN_IF_ERROR(ReadString(res));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::ReadString(string *res) {
+  int16_t length;
+  TF_RETURN_IF_ERROR(ReadShort(&length));
+
+  uint8_t *buf = new uint8_t[length];
+  Status status = ReadData(buf, length);
+
+  if (status.ok()) res->assign(reinterpret_cast<char *>(buf), length);
+
+  delete[] buf;
+  return status;
+}
+
+Status ExtendedTCPClient::ReadStringMap(std::map<string, string> *res) {
+  int size;
+  TF_RETURN_IF_ERROR(ReadInt(&size));
+
+  for (int i = 0; i < size; i++) {
+    string key;
+    string val;
+    TF_RETURN_IF_ERROR(ReadString(&key));
TF_RETURN_IF_ERROR(ReadString(&val)); + + res->insert(std::pair(std::move(key), std::move(val))); + } + + return Status::OK(); +} + +Status ExtendedTCPClient::WriteSize(std::map::size_type s) { + return WriteInt(s); +} + +Status ExtendedTCPClient::FillWithZerosUntil(int n) { + int to_skip = std::max(0, n - pos_); + + for (int i = 0; i < to_skip; i++) { + TF_RETURN_IF_ERROR(WriteByte(0)); + } + + return Status::OK(); +} + +Status ExtendedTCPClient::WriteBool(bool val) { + return WriteByte((char)(val ? 1 : 0)); +} + +Status ExtendedTCPClient::WriteString(string str) { + if (!str.empty()) { + TF_RETURN_IF_ERROR(WriteBool(false)); + size_t l = str.length(); + if (l > std::numeric_limits::max()) + return errors::InvalidArgument("String is too long"); + + TF_RETURN_IF_ERROR(WriteShort(l)); + TF_RETURN_IF_ERROR(WriteData(reinterpret_cast(str.c_str()), + str.length())); + } else { + TF_RETURN_IF_ERROR(WriteBool(true)); + } + + return Status::OK(); +} + +Status ExtendedTCPClient::WriteStringMap(std::map map) { + std::map::size_type size = map.size(); + TF_RETURN_IF_ERROR(WriteSize(size)); + + for (auto &x : map) { + TF_RETURN_IF_ERROR(WriteString(x.first)); + TF_RETURN_IF_ERROR(WriteString(x.second)); + } + + return Status::OK(); +} + +void ExtendedTCPClient::reset() { pos_ = 0; } + +} // namespace tensorflow diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h new file mode 100644 index 00000000000..c5de342fd0c --- /dev/null +++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_ + +#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h" + +namespace tensorflow { + +class ExtendedTCPClient : public PlainClient { + public: + ExtendedTCPClient(const string &host, int port, bool big_endian); + Status ReadData(uint8_t *buf, const int32_t length) override; + Status WriteData(const uint8_t *buf, const int32_t length) override; + Status Ignore(int n); + Status SkipToPos(int target_pos); + Status ReadBool(bool *res); + Status ReadNullableString(string *res); + Status ReadString(string *res); + Status ReadStringMap(std::map *res); + Status WriteSize(std::map::size_type s); + Status FillWithZerosUntil(int n); + Status WriteBool(bool val); + Status WriteString(string str); + Status WriteStringMap(std::map map); + void reset(); + + private: + int pos_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_ diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc new file mode 100644 index 00000000000..9c63f40f35f --- /dev/null +++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc @@ -0,0 +1,344 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h" + +namespace tensorflow { + +Status IGFSPath::Read(ExtendedTCPClient *client) { + return client->ReadNullableString(&path); +} + +Status IGFSFile::Read(ExtendedTCPClient *client) { + int32_t block_size; + int64_t group_block_size; + std::map properties = {}; + int64_t access_time; + + bool has_path; + TF_RETURN_IF_ERROR(client->ReadBool(&has_path)); + if (has_path) { + IGFSPath path = {}; + TF_RETURN_IF_ERROR(path.Read(client)); + } + + TF_RETURN_IF_ERROR(client->ReadInt(&block_size)); + TF_RETURN_IF_ERROR(client->ReadLong(&group_block_size)); + TF_RETURN_IF_ERROR(client->ReadLong(&length)); + TF_RETURN_IF_ERROR(client->ReadStringMap(&properties)); + TF_RETURN_IF_ERROR(client->ReadLong(&access_time)); + TF_RETURN_IF_ERROR(client->ReadLong(&modification_time)); + TF_RETURN_IF_ERROR(client->ReadByte(&flags)); + + return Status::OK(); +} + +Request::Request(int32_t command_id) : command_id_(command_id) {} + +Status Request::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(client->WriteByte(0)); + TF_RETURN_IF_ERROR(client->FillWithZerosUntil(8)); + TF_RETURN_IF_ERROR(client->WriteInt(command_id_)); + TF_RETURN_IF_ERROR(client->FillWithZerosUntil(24)); + + return Status::OK(); +} + +Status Response::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->Ignore(1)); + TF_RETURN_IF_ERROR(client->SkipToPos(8)); + TF_RETURN_IF_ERROR(client->ReadInt(&req_id)); + TF_RETURN_IF_ERROR(client->SkipToPos(24)); + TF_RETURN_IF_ERROR(client->ReadInt(&res_type)); + + bool has_error; + TF_RETURN_IF_ERROR(client->ReadBool(&has_error)); + + if (has_error) { + int32_t error_code; + string error_msg; + TF_RETURN_IF_ERROR(client->ReadString(&error_msg)); + TF_RETURN_IF_ERROR(client->ReadInt(&error_code)); + + return errors::Unknown("Error [code=", error_code, ", message=\"", + error_msg, "\"]"); + } + + TF_RETURN_IF_ERROR(client->SkipToPos(header_size_ + 5)); + TF_RETURN_IF_ERROR(client->ReadInt(&length)); + TF_RETURN_IF_ERROR(client->SkipToPos(header_size_ + response_header_size_)); + + return Status::OK(); +} + +PathCtrlRequest::PathCtrlRequest(int32_t command_id_, const string &user_name, + const string &path, + const string &destination_path, bool flag, + bool collocate, + const std::map &properties) + : Request(command_id_), + user_name_(user_name), + path_(path), + destination_path_(destination_path), + flag_(flag), + collocate_(collocate), + props_(properties) {} + +Status PathCtrlRequest::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(Request::Write(client)); + + TF_RETURN_IF_ERROR(client->WriteString(user_name_)); + TF_RETURN_IF_ERROR(WritePath(client, path_)); + TF_RETURN_IF_ERROR(WritePath(client, destination_path_)); + TF_RETURN_IF_ERROR(client->WriteBool(flag_)); + TF_RETURN_IF_ERROR(client->WriteBool(collocate_)); + TF_RETURN_IF_ERROR(client->WriteStringMap(props_)); + + return Status::OK(); +} + +Status PathCtrlRequest::WritePath(ExtendedTCPClient *client, + const string &path) const { + TF_RETURN_IF_ERROR(client->WriteBool(!path.empty())); + if (!path.empty()) TF_RETURN_IF_ERROR(client->WriteString(path)); + + return Status::OK(); +} + +Status StreamCtrlRequest::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(client->WriteByte(0)); + TF_RETURN_IF_ERROR(client->FillWithZerosUntil(8)); + TF_RETURN_IF_ERROR(client->WriteInt(command_id_)); + TF_RETURN_IF_ERROR(client->WriteLong(stream_id_)); + 
TF_RETURN_IF_ERROR(client->WriteInt(length_)); + + return Status::OK(); +} + +StreamCtrlRequest::StreamCtrlRequest(int32_t command_id_, int64_t stream_id, + int32_t length) + : Request(command_id_), stream_id_(stream_id), length_(length) {} + +DeleteRequest::DeleteRequest(const string &user_name, const string &path, + bool flag) + : PathCtrlRequest(DELETE_ID, user_name, path, {}, flag, true, {}) {} + +Status DeleteResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->ReadBool(&exists)); + + return Status::OK(); +} + +ExistsRequest::ExistsRequest(const string &user_name, const string &path) + : PathCtrlRequest(EXISTS_ID, user_name, path, {}, false, true, {}) {} + +Status ExistsResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->ReadBool(&exists)); + + return Status::OK(); +} + +HandshakeRequest::HandshakeRequest(const string &fs_name, const string &log_dir) + : Request(HANDSHAKE_ID), fs_name_(fs_name), log_dir_(log_dir) {} + +Status HandshakeRequest::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(Request::Write(client)); + + TF_RETURN_IF_ERROR(client->WriteString(fs_name_)); + TF_RETURN_IF_ERROR(client->WriteString(log_dir_)); + + return Status::OK(); +} + +Status HandshakeResponse::Read(ExtendedTCPClient *client) { + int64_t block_size; + bool sampling; + + TF_RETURN_IF_ERROR(client->ReadNullableString(&fs_name)); + TF_RETURN_IF_ERROR(client->ReadLong(&block_size)); + + bool has_sampling_; + TF_RETURN_IF_ERROR(client->ReadBool(&has_sampling_)); + + if (has_sampling_) { + TF_RETURN_IF_ERROR(client->ReadBool(&sampling)); + } + + return Status::OK(); +} + +ListRequest::ListRequest(int32_t command_id_, const string &user_name, + const string &path) + : PathCtrlRequest(command_id_, user_name, path, {}, false, true, {}) {} + +ListFilesRequest::ListFilesRequest(const string &user_name, const string &path) + : ListRequest(LIST_FILES_ID, user_name, path) {} + +ListPathsRequest::ListPathsRequest(const string &user_name, const string &path) + : ListRequest(LIST_PATHS_ID, user_name, path) {} + +OpenCreateRequest::OpenCreateRequest(const string &user_name, + const string &path) + : PathCtrlRequest(OPEN_CREATE_ID, user_name, path, {}, false, true, {}) {} + +Status OpenCreateRequest::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client)); + + TF_RETURN_IF_ERROR(client->WriteInt(replication_)); + TF_RETURN_IF_ERROR(client->WriteLong(blockSize_)); + + return Status::OK(); +} + +Status OpenCreateResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->ReadLong(&stream_id)); + + return Status::OK(); +} + +OpenAppendRequest::OpenAppendRequest(const string &user_name, + const string &path) + : PathCtrlRequest(OPEN_APPEND_ID, user_name, path, {}, false, true, {}) {} + +Status OpenAppendRequest::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client)); + + return Status::OK(); +} + +Status OpenAppendResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->ReadLong(&stream_id)); + + return Status::OK(); +} + +OpenReadRequest::OpenReadRequest(const string &user_name, const string &path, + bool flag, + int32_t sequential_reads_before_prefetch) + : PathCtrlRequest(OPEN_READ_ID, user_name, path, {}, flag, true, {}), + sequential_reads_before_prefetch_(sequential_reads_before_prefetch) {} + +OpenReadRequest::OpenReadRequest(const string &user_name, const string &path) + : OpenReadRequest(user_name, path, false, 0) {} + +Status 
OpenReadRequest::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client)); + + if (flag_) { + TF_RETURN_IF_ERROR(client->WriteInt(sequential_reads_before_prefetch_)); + } + + return Status::OK(); +} + +Status OpenReadResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->ReadLong(&stream_id)); + TF_RETURN_IF_ERROR(client->ReadLong(&length)); + + return Status::OK(); +} + +InfoRequest::InfoRequest(const string &user_name, const string &path) + : PathCtrlRequest(INFO_ID, user_name, path, {}, false, true, {}) {} + +Status InfoResponse::Read(ExtendedTCPClient *client) { + file_info = IGFSFile(); + TF_RETURN_IF_ERROR(file_info.Read(client)); + + return Status::OK(); +} + +MakeDirectoriesRequest::MakeDirectoriesRequest(const string &user_name, + const string &path) + : PathCtrlRequest(MKDIR_ID, user_name, path, {}, false, true, {}) {} + +Status MakeDirectoriesResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->ReadBool(&successful)); + + return Status::OK(); +} + +CloseRequest::CloseRequest(int64_t streamId) + : StreamCtrlRequest(CLOSE_ID, streamId, 0) {} + +Status CloseResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->ReadBool(&successful)); + + return Status::OK(); +} + +ReadBlockRequest::ReadBlockRequest(int64_t stream_id, int64_t pos, + int32_t length) + : StreamCtrlRequest(READ_BLOCK_ID, stream_id, length), pos(pos) {} + +Status ReadBlockRequest::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(StreamCtrlRequest::Write(client)); + + TF_RETURN_IF_ERROR(client->WriteLong(pos)); + + return Status::OK(); +} + +Status ReadBlockResponse::Read(ExtendedTCPClient *client, int32_t length, + uint8_t *dst) { + TF_RETURN_IF_ERROR(client->ReadData(dst, length)); + successfully_read = length; + + return Status::OK(); +} + +Status ReadBlockResponse::Read(ExtendedTCPClient *client) { + return Status::OK(); +} + +std::streamsize ReadBlockResponse::GetSuccessfullyRead() { + return successfully_read; +} + +ReadBlockCtrlResponse::ReadBlockCtrlResponse(uint8_t *dst) + : CtrlResponse(false), dst(dst) {} + +Status ReadBlockCtrlResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(Response::Read(client)); + + res = ReadBlockResponse(); + TF_RETURN_IF_ERROR(res.Read(client, length, dst)); + + return Status::OK(); +} + +WriteBlockRequest::WriteBlockRequest(int64_t stream_id, const uint8_t *data, + int32_t length) + : StreamCtrlRequest(WRITE_BLOCK_ID, stream_id, length), data(data) {} + +Status WriteBlockRequest::Write(ExtendedTCPClient *client) const { + TF_RETURN_IF_ERROR(StreamCtrlRequest::Write(client)); + TF_RETURN_IF_ERROR(client->WriteData((uint8_t *)data, length_)); + + return Status::OK(); +} + +RenameRequest::RenameRequest(const string &user_name, const string &path, + const string &destination_path) + : PathCtrlRequest(RENAME_ID, user_name, path, destination_path, false, true, + {}) {} + +Status RenameResponse::Read(ExtendedTCPClient *client) { + TF_RETURN_IF_ERROR(client->ReadBool(&successful)); + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h new file mode 100644 index 00000000000..44a2928a2b2 --- /dev/null +++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h @@ -0,0 +1,356 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_ + +#include "tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h" + +namespace tensorflow { + +enum CommandId { + HANDSHAKE_ID = 0, + EXISTS_ID = 2, + INFO_ID = 3, + RENAME_ID = 6, + DELETE_ID = 7, + MKDIR_ID = 8, + LIST_PATHS_ID = 9, + LIST_FILES_ID = 10, + OPEN_READ_ID = 13, + OPEN_APPEND_ID = 14, + OPEN_CREATE_ID = 15, + CLOSE_ID = 16, + READ_BLOCK_ID = 17, + WRITE_BLOCK_ID = 18, +}; + +class IGFSPath { + public: + Status Read(ExtendedTCPClient *client); + + string path; +}; + +class IGFSFile { + public: + Status Read(ExtendedTCPClient *client); + + int64_t length; + int64_t modification_time; + uint8_t flags; +}; + +class Request { + public: + Request(int32_t command_id); + virtual Status Write(ExtendedTCPClient *client) const; + + protected: + const int32_t command_id_; +}; + +class Response { + public: + virtual Status Read(ExtendedTCPClient *client); + + int32_t res_type; + int32_t req_id; + int32_t length; + + protected: + static const int32_t header_size_ = 24; + static const int32_t response_header_size_ = 9; +}; + +class PathCtrlRequest : public Request { + public: + PathCtrlRequest(int32_t command_id, const string &user_name, + const string &path, const string &destination_path, bool flag, + bool collocate, const std::map &properties); + Status Write(ExtendedTCPClient *client) const override; + + protected: + Status WritePath(ExtendedTCPClient *client, const string &path) const; + + const string user_name_; + const string path_; + const string destination_path_; + const bool flag_; + const bool collocate_; + const std::map props_; +}; + +class StreamCtrlRequest : public Request { + public: + StreamCtrlRequest(int32_t command_id, int64_t stream_id, int32_t length); + Status Write(ExtendedTCPClient *client) const override; + + protected: + int64_t stream_id_; + int32_t length_; +}; + +template +class CtrlResponse : public Response { + public: + CtrlResponse(bool optional) : optional_(optional) {} + Status Read(ExtendedTCPClient *client) override { + TF_RETURN_IF_ERROR(Response::Read(client)); + + if (optional_) { + TF_RETURN_IF_ERROR(client->ReadBool(&has_content)); + + if (!has_content) return Status::OK(); + } + + res = R(); + has_content = true; + TF_RETURN_IF_ERROR(res.Read(client)); + + return Status::OK(); + } + + R res; + bool has_content; + + private: + bool optional_; +}; + +template +class ListResponse { + public: + Status Read(ExtendedTCPClient *client) { + int32_t len; + TF_RETURN_IF_ERROR(client->ReadInt(&len)); + + entries.clear(); + + for (int32_t i = 0; i < len; i++) { + T f = {}; + TF_RETURN_IF_ERROR(f.Read(client)); + entries.push_back(f); + } + + return Status::OK(); + } + + std::vector entries; +}; + +class DeleteRequest : public PathCtrlRequest { + public: + DeleteRequest(const string &user_name, const string 
&path, bool flag); +}; + +class DeleteResponse { + public: + Status Read(ExtendedTCPClient *client); + + bool exists; +}; + +class ExistsRequest : public PathCtrlRequest { + public: + explicit ExistsRequest(const string &user_name, const string &path); +}; + +class ExistsResponse { + public: + Status Read(ExtendedTCPClient *client); + + bool exists; +}; + +class HandshakeRequest : public Request { + public: + HandshakeRequest(const string &fs_name, const string &log_dir); + Status Write(ExtendedTCPClient *client) const override; + + private: + string fs_name_; + string log_dir_; +}; + +class HandshakeResponse { + public: + Status Read(ExtendedTCPClient *client); + + string fs_name; +}; + +class ListRequest : public PathCtrlRequest { + public: + explicit ListRequest(int32_t command_id, const string &user_name, + const string &path); +}; + +class ListFilesRequest : public ListRequest { + public: + ListFilesRequest(const string &user_name, const string &path); +}; + +class ListFilesResponse : public ListResponse {}; + +class ListPathsRequest : public ListRequest { + public: + ListPathsRequest(const string &user_name, const string &path); +}; + +class ListPathsResponse : public ListResponse {}; + +class OpenCreateRequest : public PathCtrlRequest { + public: + OpenCreateRequest(const string &user_name, const string &path); + Status Write(ExtendedTCPClient *client) const override; + + private: + int32_t replication_; + int64_t blockSize_; +}; + +class OpenCreateResponse { + public: + Status Read(ExtendedTCPClient *client); + + int64_t stream_id; +}; + +class OpenAppendRequest : public PathCtrlRequest { + public: + explicit OpenAppendRequest(const string &user_name, const string &path); + Status Write(ExtendedTCPClient *client) const override; +}; + +class OpenAppendResponse { + public: + Status Read(ExtendedTCPClient *client); + + int64_t stream_id; +}; + +class OpenReadRequest : public PathCtrlRequest { + public: + OpenReadRequest(const string &user_name, const string &path, bool flag, + int32_t seqReadsBeforePrefetch); + OpenReadRequest(const string &user_name, const string &path); + Status Write(ExtendedTCPClient *client) const override; + + protected: + /** Sequential reads before prefetch. */ + int32_t sequential_reads_before_prefetch_; +}; + +class OpenReadResponse { + public: + Status Read(ExtendedTCPClient *client); + + int64_t stream_id; + int64_t length; +}; + +class InfoRequest : public PathCtrlRequest { + public: + InfoRequest(const string &user_name, const string &path); +}; + +class InfoResponse { + public: + Status Read(ExtendedTCPClient *client); + + IGFSFile file_info; +}; + +class MakeDirectoriesRequest : public PathCtrlRequest { + public: + MakeDirectoriesRequest(const string &userName, const string &path); +}; + +class MakeDirectoriesResponse { + public: + Status Read(ExtendedTCPClient *client); + + bool successful; +}; + +/** Stream control requests. 
**/ + +class CloseRequest : public StreamCtrlRequest { + public: + explicit CloseRequest(int64_t stream_id); +}; + +class CloseResponse { + public: + Status Read(ExtendedTCPClient *client); + + bool successful; +}; + +class ReadBlockRequest : public StreamCtrlRequest { + public: + ReadBlockRequest(int64_t stream_id, int64_t pos, int32_t length); + Status Write(ExtendedTCPClient *client) const override; + + private: + int64_t pos; +}; + +class ReadBlockResponse { + public: + Status Read(ExtendedTCPClient *client, int32_t length, uint8_t *dst); + Status Read(ExtendedTCPClient *client); + std::streamsize GetSuccessfullyRead(); + + private: + int32_t length; + std::streamsize successfully_read; +}; + +class ReadBlockCtrlResponse : public CtrlResponse { + public: + ReadBlockCtrlResponse(uint8_t *dst); + Status Read(ExtendedTCPClient *client) override; + + private: + uint8_t *dst; +}; + +class WriteBlockRequest : public StreamCtrlRequest { + public: + WriteBlockRequest(int64_t stream_id, const uint8_t *data, int32_t length); + Status Write(ExtendedTCPClient *client) const override; + + private: + const uint8_t *data; +}; + +class RenameRequest : public PathCtrlRequest { + public: + RenameRequest(const string &user_name, const string &path, + const string &destination_path); +}; + +class RenameResponse { + public: + Status Read(ExtendedTCPClient *client); + + bool successful; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_ diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc new file mode 100644 index 00000000000..a4c898f14e6 --- /dev/null +++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc @@ -0,0 +1,48 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h" +#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h" + +namespace tensorflow { + +IGFSRandomAccessFile::IGFSRandomAccessFile(const string &file_name, + int64_t resource_id, + std::unique_ptr &&client) + : file_name_(file_name), + resource_id_(resource_id), + client_(std::move(client)) {} + +IGFSRandomAccessFile::~IGFSRandomAccessFile() { + CtrlResponse close_response = {false}; + Status status = client_->Close(&close_response, resource_id_); + + if (!status.ok()) LOG(ERROR) << status.ToString(); +} + +Status IGFSRandomAccessFile::Read(uint64 offset, size_t n, StringPiece *result, + char *scratch) const { + ReadBlockCtrlResponse response = ReadBlockCtrlResponse((uint8_t *)scratch); + TF_RETURN_IF_ERROR(client_->ReadBlock(&response, resource_id_, offset, n)); + + std::streamsize sz = response.res.GetSuccessfullyRead(); + if (sz == 0) return errors::OutOfRange("End of file"); + + *result = StringPiece(scratch, sz); + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h new file mode 100644 index 00000000000..b21369ff8a3 --- /dev/null +++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h @@ -0,0 +1,40 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_ + +#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h" +#include "tensorflow/core/platform/file_system.h" + +namespace tensorflow { + +class IGFSRandomAccessFile : public RandomAccessFile { + public: + IGFSRandomAccessFile(const string &file_name, int64_t resource_id, + std::unique_ptr &&client); + ~IGFSRandomAccessFile() override; + Status Read(uint64 offset, size_t n, StringPiece *result, + char *scratch) const override; + + private: + const string file_name_; + const int64_t resource_id_; + std::unique_ptr client_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_ diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc new file mode 100644 index 00000000000..c15ecb7deeb --- /dev/null +++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc @@ -0,0 +1,62 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h" +#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h" + +namespace tensorflow { + +IGFSWritableFile::IGFSWritableFile(const string &file_name, int64_t resource_id, + std::unique_ptr &&client) + : file_name_(file_name), + resource_id_(resource_id), + client_(std::move(client)) {} + +IGFSWritableFile::~IGFSWritableFile() { + if (resource_id_ >= 0) { + CtrlResponse close_response = {false}; + + Status status = client_->Close(&close_response, resource_id_); + if (!status.ok()) LOG(ERROR) << status.ToString(); + } +} + +Status IGFSWritableFile::Append(StringPiece data) { + return client_->WriteBlock(resource_id_, (uint8_t *)data.data(), data.size()); +} + +Status IGFSWritableFile::Close() { + int64_t resource_to_be_closed = resource_id_; + resource_id_ = -1; + + CtrlResponse close_response = {false}; + return client_->Close(&close_response, resource_to_be_closed); +} + +Status IGFSWritableFile::Flush() { return Sync(); } + +Status IGFSWritableFile::Sync() { + CtrlResponse close_response = {false}; + TF_RETURN_IF_ERROR(client_->Close(&close_response, resource_id_)); + + CtrlResponse open_append_resp(false); + TF_RETURN_IF_ERROR(client_->OpenAppend(&open_append_resp, file_name_)); + + resource_id_ = open_append_resp.res.stream_id; + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h new file mode 100644 index 00000000000..b406db17e0e --- /dev/null +++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_ +#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_ + +#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h" +#include "tensorflow/core/platform/file_system.h" + +namespace tensorflow { + +class IGFSWritableFile : public WritableFile { + public: + IGFSWritableFile(const string &file_name, int64_t resource_id, + std::unique_ptr &&client); + ~IGFSWritableFile() override; + Status Append(StringPiece data) override; + Status Close() override; + Status Flush() override; + Status Sync() override; + + private: + const string file_name_; + int64_t resource_id_; + std::unique_ptr client_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_ diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/contrib/ignite/ops/igfs_ops.cc similarity index 62% rename from tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc rename to tensorflow/contrib/ignite/ops/igfs_ops.cc index f3b24b2341e..473bddff08b 100644 --- a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc +++ b/tensorflow/contrib/ignite/ops/igfs_ops.cc @@ -1,4 +1,4 @@ -/* Copyright 2016 Google Inc. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,17 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" +#include "tensorflow/core/platform/env.h" + +#include "tensorflow/contrib/ignite/kernels/igfs/igfs.h" namespace tensorflow { -namespace fuzzing { -class FuzzDecodeJpeg : public FuzzStringInputOp { - SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodeJpeg); -}; +REGISTER_FILE_SYSTEM("igfs", IGFS); -STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeJpeg); - -} // end namespace fuzzing -} // end namespace tensorflow +} // namespace tensorflow diff --git a/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py new file mode 100644 index 00000000000..8e1d6707d64 --- /dev/null +++ b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py @@ -0,0 +1,24 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Python helper for loading IGFS ops and kernels.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.util import loader +from tensorflow.python.platform import resource_loader + +_dataset_ops = loader.load_op_library( + resource_loader.get_path_to_datafile("../../_ignite_ops.so")) diff --git a/tensorflow/contrib/ignite/python/ops/igfs_ops.py b/tensorflow/contrib/ignite/python/ops/igfs_ops.py new file mode 100644 index 00000000000..12b973b7077 --- /dev/null +++ b/tensorflow/contrib/ignite/python/ops/igfs_ops.py @@ -0,0 +1,40 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Ignite File System for checkpointing and communication with TensorBoard. + +Apache Ignite is a memory-centric distributed database, caching, and +processing platform for transactional, analytical, and streaming workloads, +delivering in-memory speeds at petabyte scale. In addition to database +functionality Apache Ignite provides a distributed file system called +IGFS (https://ignite.apache.org/features/igfs.html). IGFS delivers a similar +functionality to Hadoop HDFS, but only in-memory. In fact, in addition to +its own APIs, IGFS implements Hadoop FileSystem API and can be transparently +plugged into Hadoop or Spark deployments. This contrib package contains an +integration between IGFS and TensorFlow. 
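+
+A minimal usage sketch is shown below; the igfs:// path is only an example,
+and it assumes an IGFS server is reachable at the host and port given by the
+IGFS_HOST and IGFS_PORT environment variables (defaulting to localhost and
+10500):
+
+  import tensorflow.contrib.ignite.python.ops.igfs_ops  # pylint: disable=unused-import
+  from tensorflow.python.platform import gfile
+
+  with gfile.Open("igfs:///tmp/hello.txt", mode="w") as w:
+    w.write("Hello, world.")
+  with gfile.Open("igfs:///tmp/hello.txt", mode="r") as r:
+    print(r.read())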
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.contrib.ignite.python.ops import ignite_op_loader # pylint: disable=unused-import +from tensorflow.python.framework import load_library +from tensorflow.python.platform import resource_loader + +file_system_library = os.path.join(resource_loader.get_data_files_path(), + "../../_ignite_ops.so") +load_library.load_file_system_library(file_system_library) diff --git a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py index c9af7386cf0..e450e2d84ba 100644 --- a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py +++ b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py @@ -21,4 +21,4 @@ from tensorflow.contrib.util import loader from tensorflow.python.platform import resource_loader _dataset_ops = loader.load_op_library( - resource_loader.get_path_to_datafile("../../_dataset_ops.so")) + resource_loader.get_path_to_datafile("../../_ignite_ops.so")) diff --git a/tensorflow/contrib/signal/python/__init__.py b/tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh old mode 100644 new mode 100755 similarity index 73% rename from tensorflow/contrib/signal/python/__init__.py rename to tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh index e672d1146c5..5e39e16c052 --- a/tensorflow/contrib/signal/python/__init__.py +++ b/tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh @@ -1,4 +1,5 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +#!/usr/bin/env bash +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Signal ops.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +nohup apache-ignite-fabric/bin/ignite.sh /data/config/ignite-config-igfs.xml & +sleep 5 # Wait Apache Ignite to be started + +tail -f nohup.out diff --git a/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml b/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml new file mode 100644 index 00000000000..5d81bf33226 --- /dev/null +++ b/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 127.0.0.1 + + + + + + + + + diff --git a/tensorflow/contrib/ignite/python/tests/igfs_test.py b/tensorflow/contrib/ignite/python/tests/igfs_test.py new file mode 100644 index 00000000000..cacfc568942 --- /dev/null +++ b/tensorflow/contrib/ignite/python/tests/igfs_test.py @@ -0,0 +1,215 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations under +# the License. +# ============================================================================== +"""Tests for IGFS.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.contrib.ignite.python.ops.igfs_ops # pylint: disable=unused-import +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test + + +class IGFSTest(test.TestCase): + """The Apache Ignite servers have to setup before the test and tear down + + after the test manually. The docker engine has to be installed. + + To setup Apache Ignite servers: + $ bash start_ignite.sh + + To tear down Apache Ignite servers: + $ bash stop_ignite.sh + """ + + def test_create_file(self): + """Test create file. + + """ + # Setup and check preconditions. + file_name = "igfs:///test_create_file/1" + self.assertFalse(gfile.Exists(file_name)) + # Create file. + with gfile.Open(file_name, mode="w") as w: + w.write("") + # Check that file was created. + self.assertTrue(gfile.Exists(file_name)) + + def test_write_read_file(self): + """Test write/read file. + + """ + # Setup and check preconditions. + file_name = "igfs:///test_write_read_file/1" + rows = 10000 + self.assertFalse(gfile.Exists(file_name)) + # Write data. + with gfile.Open(file_name, mode="w") as w: + for i in range(rows): + w.write("This is row\n") + # Read data. + with gfile.Open(file_name, mode="r") as r: + lines = r.readlines() + # Check that data is equal. + self.assertEqual(rows, len(lines)) + for i in range(rows): + self.assertEqual("This is row\n", lines[i]) + + def test_delete_recursively(self): + """Test delete recursively. + + """ + # Setup and check preconditions. + dir_name = "igfs:///test_delete_recursively/" + file_name = "igfs:///test_delete_recursively/1" + self.assertFalse(gfile.Exists(dir_name)) + self.assertFalse(gfile.Exists(file_name)) + gfile.MkDir(dir_name) + with gfile.Open(file_name, mode="w") as w: + w.write("") + self.assertTrue(gfile.Exists(dir_name)) + self.assertTrue(gfile.Exists(file_name)) + # Delete directory recursively. + gfile.DeleteRecursively(dir_name) + # Check that directory was deleted. + self.assertFalse(gfile.Exists(dir_name)) + self.assertFalse(gfile.Exists(file_name)) + + def test_copy(self): + """Test copy. + + """ + # Setup and check preconditions. + src_file_name = "igfs:///test_copy/1" + dst_file_name = "igfs:///test_copy/2" + self.assertFalse(gfile.Exists(src_file_name)) + self.assertFalse(gfile.Exists(dst_file_name)) + with gfile.Open(src_file_name, mode="w") as w: + w.write("42") + self.assertTrue(gfile.Exists(src_file_name)) + self.assertFalse(gfile.Exists(dst_file_name)) + # Copy file. + gfile.Copy(src_file_name, dst_file_name) + # Check that files are identical. + self.assertTrue(gfile.Exists(src_file_name)) + self.assertTrue(gfile.Exists(dst_file_name)) + with gfile.Open(dst_file_name, mode="r") as r: + data = r.read() + self.assertEqual("42", data) + + def test_is_directory(self): + """Test is directory. + + """ + # Setup and check preconditions. + dir_name = "igfs:///test_is_directory/1" + file_name = "igfs:///test_is_directory/2" + with gfile.Open(file_name, mode="w") as w: + w.write("") + gfile.MkDir(dir_name) + # Check that directory is a directory. + self.assertTrue(gfile.IsDirectory(dir_name)) + # Check that file is not a directory. 
+ self.assertFalse(gfile.IsDirectory(file_name)) + + def test_list_directory(self): + """Test list directory. + + """ + # Setup and check preconditions. + dir_name = "igfs:///test_list_directory/" + file_names = [ + "igfs:///test_list_directory/1", "igfs:///test_list_directory/2/3" + ] + ch_dir_names = [ + "igfs:///test_list_directory/4", + ] + for file_name in file_names: + with gfile.Open(file_name, mode="w") as w: + w.write("") + for ch_dir_name in ch_dir_names: + gfile.MkDir(ch_dir_name) + ls_expected_result = file_names + ch_dir_names + # Get list of files in directory. + ls_result = gfile.ListDirectory(dir_name) + # Check that list of files is correct. + self.assertEqual(len(ls_expected_result), len(ls_result)) + for e in ["1", "2", "4"]: + self.assertTrue(e in ls_result) + + def test_make_dirs(self): + """Test make dirs. + + """ + # Setup and check preconditions. + dir_name = "igfs:///test_make_dirs/" + self.assertFalse(gfile.Exists(dir_name)) + # Make directory. + gfile.MkDir(dir_name) + # Check that directory was created. + self.assertTrue(gfile.Exists(dir_name)) + + def test_remove(self): + """Test remove. + + """ + # Setup and check preconditions. + file_name = "igfs:///test_remove/1" + self.assertFalse(gfile.Exists(file_name)) + with gfile.Open(file_name, mode="w") as w: + w.write("") + self.assertTrue(gfile.Exists(file_name)) + # Remove file. + gfile.Remove(file_name) + # Check that file was removed. + self.assertFalse(gfile.Exists(file_name)) + + def test_rename_file(self): + """Test rename file. + + """ + # Setup and check preconditions. + src_file_name = "igfs:///test_rename_file/1" + dst_file_name = "igfs:///test_rename_file/2" + with gfile.Open(src_file_name, mode="w") as w: + w.write("42") + self.assertTrue(gfile.Exists(src_file_name)) + # Rename file. + gfile.Rename(src_file_name, dst_file_name) + # Check that only new name of file is available. + self.assertFalse(gfile.Exists(src_file_name)) + self.assertTrue(gfile.Exists(dst_file_name)) + with gfile.Open(dst_file_name, mode="r") as r: + data = r.read() + self.assertEqual("42", data) + + def test_rename_dir(self): + """Test rename dir. + + """ + # Setup and check preconditions. + src_dir_name = "igfs:///test_rename_dir/1" + dst_dir_name = "igfs:///test_rename_dir/2" + gfile.MkDir(src_dir_name) + # Rename directory. + gfile.Rename(src_dir_name, dst_dir_name) + # Check that only new name of directory is available. + self.assertFalse(gfile.Exists(src_dir_name)) + self.assertTrue(gfile.Exists(dst_dir_name)) + self.assertTrue(gfile.IsDirectory(dst_dir_name)) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/ignite/python/tests/start_ignite.sh b/tensorflow/contrib/ignite/python/tests/start_ignite.sh index a67bd44f2fb..112e0dea844 100755 --- a/tensorflow/contrib/ignite/python/tests/start_ignite.sh +++ b/tensorflow/contrib/ignite/python/tests/start_ignite.sh @@ -20,3 +20,7 @@ SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )" # Start Apache Ignite with plain client listener. docker run -itd --name ignite-plain -p 42300:10800 \ -v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-plain.sh + +# Start Apache Ignite with IGFS. 
+docker run -itd --name ignite-igfs -p 10500:10500 \ +-v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-igfs.sh \ No newline at end of file diff --git a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh index 8f03dbd1ede..35b0f32d1b3 100755 --- a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh +++ b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh @@ -15,5 +15,4 @@ # ============================================================================== docker rm -f ignite-plain -docker rm -f ignite-ssl -docker rm -f ignite-ssl-auth +docker rm -f ignite-igfs \ No newline at end of file diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD index c8812d4b23f..588f15b867c 100644 --- a/tensorflow/contrib/labeled_tensor/BUILD +++ b/tensorflow/contrib/labeled_tensor/BUILD @@ -70,7 +70,10 @@ py_test( "python/ops/core_test.py", ], srcs_version = "PY2AND3", - tags = ["no_windows"], # TODO: needs investigation on Windows + tags = [ + "no_windows", # TODO: needs investigation on Windows + "noasan", # TODO(b/119323169) + ], deps = [ ":_typecheck", ":core", diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index eab93f2cc5e..e779eff6890 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -42,6 +42,7 @@ tensorflow/core/kernels/conv_grad_filter_ops.cc tensorflow/core/kernels/conv_grad_input_ops.cc tensorflow/core/kernels/conv_grad_ops.cc tensorflow/core/kernels/conv_ops.cc +tensorflow/core/kernels/conv_ops_3d.cc tensorflow/core/kernels/conv_ops_fused.cc tensorflow/core/kernels/conv_ops_using_gemm.cc tensorflow/core/kernels/crop_and_resize_op.cc @@ -163,6 +164,7 @@ tensorflow/core/kernels/pack_op.cc tensorflow/core/kernels/pad_op.cc tensorflow/core/kernels/padding_fifo_queue.cc tensorflow/core/kernels/padding_fifo_queue_op.cc +tensorflow/core/kernels/pooling_ops_3d.cc tensorflow/core/kernels/pooling_ops_common.cc tensorflow/core/kernels/population_count_op.cc tensorflow/core/kernels/quantization_utils.cc @@ -248,6 +250,7 @@ tensorflow/core/kernels/spectrogram_op.cc tensorflow/core/kernels/split_lib_cpu.cc tensorflow/core/kernels/split_op.cc tensorflow/core/kernels/split_v_op.cc +tensorflow/core/kernels/stack.cc tensorflow/core/kernels/stack_ops.cc tensorflow/core/kernels/strided_slice_op.cc tensorflow/core/kernels/strided_slice_op_inst_0.cc diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md index b313024e285..45a60d79482 100644 --- a/tensorflow/contrib/model_pruning/README.md +++ b/tensorflow/contrib/model_pruning/README.md @@ -51,7 +51,7 @@ The pruning library allows for specification of the following hyper parameters: | begin_pruning_step | integer | 0 | The global step at which to begin pruning | | end_pruning_step | integer | -1 | The global step at which to terminate pruning. Defaults to -1 implying that pruning continues till the training stops | | weight_sparsity_map | list of strings | [""] | list of weight variable name (or layer name):target sparsity pairs. Eg. [conv1:0.9,conv2/kernel:0.8]. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. 
| -| threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds | +| threshold_decay | float | 0.0 | The decay factor to use for exponential decay of the thresholds | | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) | | nbins | integer | 256 | Number of bins to use for histogram computation. Note: When running on TPUs, a large (>1024) value for `nbins` may adversely affect the training time. | | block_height|integer | 1 | Number of rows in a block for block sparse matrices| diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py index d2b81164176..f6b4373edd0 100644 --- a/tensorflow/contrib/model_pruning/python/pruning.py +++ b/tensorflow/contrib/model_pruning/python/pruning.py @@ -204,7 +204,7 @@ def get_pruning_hparams(): begin_pruning_step=0, end_pruning_step=-1, weight_sparsity_map=[''], - threshold_decay=0.9, + threshold_decay=0.0, pruning_frequency=10, nbins=256, block_height=1, @@ -456,13 +456,14 @@ class Pruning(object): pool_window = [self._block_dim[0], self._block_dim[1]] pool_fn = pruning_utils.factorized_pool - + squeeze_axis = None if not self._spec.use_tpu: pool_fn = nn_ops.pool abs_weights = array_ops.reshape( abs_weights, [1, abs_weights.get_shape()[0], abs_weights.get_shape()[1], 1]) + squeeze_axis = [0, 3] pooled_weights = pool_fn( abs_weights, @@ -473,7 +474,7 @@ class Pruning(object): name=weights.op.name + '_pooled') if pooled_weights.get_shape().ndims != 2: - pooled_weights = array_ops.squeeze(pooled_weights) + pooled_weights = array_ops.squeeze(pooled_weights, axis=squeeze_axis) smoothed_threshold, new_mask = self._update_mask(pooled_weights, threshold) diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py index 91b0bb7f600..14fc51229ab 100644 --- a/tensorflow/contrib/model_pruning/python/pruning_utils.py +++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py @@ -188,7 +188,6 @@ def _histogram(values, value_range, nbins=100, dtype=dtypes.int32, name=None): with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope: values = ops.convert_to_tensor(values, name='values') values = array_ops.reshape(values, [-1]) - value_range = ops.convert_to_tensor(value_range, name='value_range') nbins_float = np.float32(nbins) # Map tensor values that fall within value_range to [0, 1]. @@ -250,7 +249,6 @@ def compute_cdf(values, value_range, **kwargs): name = kwargs.get('name', None) with ops.name_scope(name, 'cdf', [values, value_range, nbins]): values = ops.convert_to_tensor(values, name='values') - value_range = ops.convert_to_tensor(value_range, name='value_range') nbins_float = np.float32(nbins) # Map tensor values that fall within value_range to [0, 1]. 
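The README table and the get_pruning_hparams() change above lower the threshold_decay default from 0.9 to 0.0. A minimal sketch of building a pruning spec with that value set explicitly (hedged illustration only: the step counts and target sparsity below are made-up example values, not part of this change):

# Illustrative only; hyperparameter values are examples, not recommendations.
from tensorflow.contrib.model_pruning.python import pruning
from tensorflow.python.training import training_util

global_step = training_util.get_or_create_global_step()
hparams = pruning.get_pruning_hparams().parse(
    "begin_pruning_step=1000,end_pruning_step=20000,"
    "target_sparsity=0.9,threshold_decay=0.0,pruning_frequency=10")
# Masks are added to the graph separately via pruning.apply_mask on layer weights;
# the update op below then adjusts them on the configured schedule.
mask_update_op = pruning.Pruning(
    hparams, global_step=global_step).conditional_mask_update_op()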
@@ -336,7 +334,7 @@ def factorized_pool(input_tensor, padding=padding) return array_ops.squeeze( - array_ops.transpose(width_pooling, perm=[0, 1, 3, 2])) + array_ops.transpose(width_pooling, perm=[0, 1, 3, 2]), axis=[0, 1]) def determine_partitioned_axis(partitioned_variable): diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py index 0aca8434976..d6f2bfcb6c2 100644 --- a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py +++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py @@ -85,8 +85,28 @@ class PruningUtilsTest(test.TestCase): @parameterized.named_parameters( - ("1x1", [1, 1]), ("4x4", [4, 4]), ("6x6", [6, 6]), ("1x4", [1, 4]), - ("4x1", [4, 1]), ("1x8", [1, 8]), ("8x1", [8, 1])) + ("Input_32x32_block_1x1", [32, 32], [1, 1]), + # block size 6x6 + ("Input_3x3_block_6x6", [3, 3], [6, 6]), + ("Input_32x32_block_6x6", [32, 32], [6, 6]), + ("Input_2x32_block_6x6", [2, 32], [6, 6]), + ("Input_32x2_block_6x6", [32, 2], [6, 6]), + ("Input_30x30_block_6x6", [30, 30], [6, 6]), + # block size 4x4 + ("Input_32x32_block_4x4", [32, 32], [4, 4]), + ("Input_2x32_block_4x4", [2, 32], [4, 4]), + ("Input_32x2_block_4x4", [32, 2], [4, 4]), + ("Input_30x30_block_4x4", [30, 30], [4, 4]), + # block size 1x4 + ("Input_32x32_block_1x4", [32, 32], [1, 4]), + ("Input_2x32_block_1x4", [2, 32], [1, 4]), + ("Input_32x2_block_1x4", [32, 2], [1, 4]), + ("Input_30x30_block_1x4", [30, 30], [1, 4]), + # block size 4x1 + ("Input_32x32_block_4x1", [32, 32], [4, 1]), + ("Input_2x32_block_4x1", [2, 32], [4, 1]), + ("Input_32x2_block_4x1", [32, 2], [4, 1]), + ("Input_30x30_block_4x1", [30, 30], [4, 1])) class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase): def _compare_pooling_methods(self, weights, pooling_kwargs): @@ -97,9 +117,11 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase): array_ops.reshape( weights, [1, weights.get_shape()[0], - weights.get_shape()[1], 1]), **pooling_kwargs)) + weights.get_shape()[1], 1]), **pooling_kwargs), + axis=[0, 3]) pooled_weights_factorized_pool = pruning_utils.factorized_pool( weights, **pooling_kwargs) + self.assertAllClose(pooled_weights_tf.eval(), pooled_weights_factorized_pool.eval()) @@ -113,8 +135,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase): [expanded_tensor, kronecker_product]) self.assertAllEqual(expanded_tensor_val, kronecker_product_val) - def testFactorizedAvgPool(self, window_shape): - weights = variable_scope.get_variable("weights", shape=[1024, 2048]) + def testFactorizedAvgPool(self, input_shape, window_shape): + weights = variable_scope.get_variable("weights", shape=input_shape) pooling_kwargs = { "window_shape": window_shape, "pooling_type": "AVG", @@ -123,8 +145,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase): } self._compare_pooling_methods(weights, pooling_kwargs) - def testFactorizedMaxPool(self, window_shape): - weights = variable_scope.get_variable("weights", shape=[1024, 2048]) + def testFactorizedMaxPool(self, input_shape, window_shape): + weights = variable_scope.get_variable("weights", shape=input_shape) pooling_kwargs = { "window_shape": window_shape, "pooling_type": "MAX", @@ -133,8 +155,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase): } self._compare_pooling_methods(weights, pooling_kwargs) - def testExpandTensor(self, block_dim): - weights = random_ops.random_normal(shape=[1024, 512]) + def 
testExpandTensor(self, input_shape, block_dim): + weights = random_ops.random_normal(shape=input_shape) self._compare_expand_tensor_with_kronecker_product(weights, block_dim) diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py index 9ce50bfe105..b7fd2d2fb9d 100644 --- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py @@ -106,6 +106,32 @@ class MovingAverageOptimizer(optimizer.Optimizer): self._swapped_variable_name_map[v_avg.op.name] = v.op.name return control_flow_ops.group(train_op, ma_op, name='train_with_avg') + def _find_swapped_variable(self, v_name_to_tensor, v_name, tensor): + """Returns name of swapped variable for given tensor. + + Args: + v_name_to_tensor: Mapping from variable names to tensors. + v_name: name of the variable for which swapped variable should be returned + tensor: Tensor which correspond to variable for which swapped variable + should be returned. + + Returns: + Tensor which correspond to swapped variable. + + Raises: + ValueError: If swapped variable could not be found in v_name_to_tensor. + """ + swapped_v_name = self._swapped_variable_name_map.get(v_name, None) + if swapped_v_name is None: + return tensor + else: + if swapped_v_name in v_name_to_tensor: + return v_name_to_tensor[swapped_v_name] + else: + raise ValueError( + ('Variable to swap %s is not part of variables to save. ' + 'This breaks MovingAverageOptimizer.') % swapped_v_name) + def swapping_saver(self, var_list=None, name='swapping_saver', **kwargs): """Create a saver swapping moving averages and variables. @@ -141,33 +167,33 @@ class MovingAverageOptimizer(optimizer.Optimizer): if not isinstance(var_list, dict): var_list = saver.BaseSaverBuilder.OpListToDict(var_list) - # OpListToDict converts variables to tensors. We make sure we can get - # the unique variable name for normal and resource vaiables. - def get_v_name(tensor): - if tensor.op.type == 'ReadVariableOp': - return tensor.op.inputs[0].op.name - else: - return tensor.op.name - v_name_to_tensor = {} - for tensor in six.itervalues(var_list): - v_name = get_v_name(tensor) - v_name_to_tensor[v_name] = tensor + for k, tensor_or_list in six.iteritems(var_list): + # For each partitioned variable OpListToDict returns list of constituent + # parts instead of single tensor. + if (isinstance(tensor_or_list, list) + or isinstance(tensor_or_list, variables.PartitionedVariable)): + for tensor in tensor_or_list: + v_name = tensor.op.name + v_name_to_tensor[v_name] = tensor + else: + v_name_to_tensor[k] = tensor_or_list # Now swap variables and moving averages swapped_var_list = {} - for k, tensor in six.iteritems(var_list): - v_name = get_v_name(tensor) - swapped_v_name = self._swapped_variable_name_map.get(v_name, None) - tensor_to_save = tensor - if swapped_v_name is not None: - if swapped_v_name in v_name_to_tensor: - tensor_to_save = v_name_to_tensor[swapped_v_name] - else: - raise ValueError( - ('Variable to swap %s is not part of variables to save. 
' - 'This breaks MovingAverageOptimizer.') % swapped_v_name) - swapped_var_list[k] = tensor_to_save + for k, tensor_or_list in six.iteritems(var_list): + if isinstance(tensor_or_list, list): + tensor_list_to_save = [] + for tensor in tensor_or_list: + v_name = tensor.op.name + swapped_variable = self._find_swapped_variable(v_name_to_tensor, + v_name, + tensor) + tensor_list_to_save.append(swapped_variable) + swapped_var_list[k] = tensor_list_to_save + else: + swapped_var_list[k] = self._find_swapped_variable( + v_name_to_tensor, k, tensor_or_list) # Build the swapping saver. return saver.Saver(swapped_var_list, name=name, **kwargs) diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py index f22e7245285..643403eea6f 100644 --- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py @@ -26,6 +26,8 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import partitioned_variables from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables @@ -43,97 +45,171 @@ class MovingAverageOptimizerTest(test.TestCase): # Test that MovingAverageOptimizer works with resource variables. self._helpTestRun(use_resource=True) - def _helpTestRun(self, use_resource=False): + def testRunUsePartitionedVars(self): + # Test that MovingAverageOptimizer works with partitioned variables. + self._helpTestRun(use_partitioned_vars=True) + + def testRunUseResourcePartitionedVars(self): + # Test that MovingAverageOptimizer works with resource and partitioned + # variables. + self._helpTestRun(use_partitioned_vars=True, use_resource=True) + + def _helpTestRun(self, use_resource=False, use_partitioned_vars=False): + # Partitioned variables are represented as a "collection" of partitions. + # To simplify the test and reuse as much code as possible we employ + # following test strategy for partitioned variables. + # + # In the case of non-partitioned variables test runs on variables with + # shape [2]. + # + # In the case of partitioned variables we use shape [4] with two partitions, + # thus each partition has shape [2]. + # For partitioned variables the test is run twice (for loop over + # variable_part_names), first time on the first partition of each variable, + # second time on the second partition of each variable. 
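The comment block above explains the partitioning strategy in prose; a tiny self-contained sketch of what fixed_size_partitioner does to a [4]-shaped variable (names and values are illustrative, not taken from the test):

# Sketch only: shows the part naming ('.../part_0', '.../part_1') the test loops over.
import tensorflow as tf

with tf.Graph().as_default():
  v = tf.get_variable(
      "v",
      initializer=tf.constant([1.0, 2.0, 3.0, 4.0]),
      partitioner=tf.fixed_size_partitioner(num_shards=2))
  part_names = [p.op.name for p in v]  # ['v/part_0', 'v/part_1'], each of shape [2]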
+ variable_part_names = ['part_0', 'part_1'] if use_partitioned_vars else [''] for sequential_update in [True, False]: for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: - with self.session(graph=ops.Graph()) as sess: - orig_val0 = [1.0, 2.0] - orig_val1 = [3.0, 4.0] - var0 = variable_scope.get_variable( - 'var0', - initializer=constant_op.constant(orig_val0, dtype=dtype), - use_resource=use_resource) - var1 = variable_scope.get_variable( - 'var1', - initializer=constant_op.constant(orig_val1, dtype=dtype), - use_resource=use_resource) - grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) - grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + for var_part_name in variable_part_names: + with self.session(graph=ops.Graph()) as sess: + orig_val0 = [1.0, 2.0] + orig_val1 = [3.0, 4.0] + grads0 = [0.1, 0.1] + grads1 = [0.01, 0.01] + if use_partitioned_vars: + # Use partitioned variables. + # Create partitioned and duplicate each value used as initial + # value of variables. + partitioner = partitioned_variables.fixed_size_partitioner( + num_shards=2) + orig_val0 = orig_val0 * 2 + orig_val1 = orig_val1 * 2 + grads0 = grads0 * 2 + grads1 = grads1 * 2 + else: + # Regular (non-partitioned) variables. + partitioner = None + var0 = variable_scope.get_variable( + 'var0', + initializer=constant_op.constant(orig_val0, dtype=dtype), + use_resource=use_resource, + partitioner=partitioner) + var1 = variable_scope.get_variable( + 'var1', + initializer=constant_op.constant(orig_val1, dtype=dtype), + use_resource=use_resource, + partitioner=partitioner) + # Make a fake loss, such that gradient(loss, var0) == grads0 + # and gradient(loss, var1) == grads1 + grads0 = constant_op.constant(grads0, dtype=dtype) + grads1 = constant_op.constant(grads1, dtype=dtype) + loss = (math_ops.reduce_sum(grads0 * var0) + + math_ops.reduce_sum(grads1 * var1)) - opt = moving_average_optimizer.MovingAverageOptimizer( - gradient_descent.GradientDescentOptimizer(learning_rate=2.0), - average_decay=0.5, - sequential_update=sequential_update) - save_dir = tempfile.mkdtemp( - prefix=os.path.join(self.get_temp_dir(), 'run_1')) - save_path = os.path.join(save_dir, 'model') - update = opt.apply_gradients( - list(six.moves.zip([grads0, grads1], [var0, var1]))) - global_vars = variables.global_variables() - ema_var0 = [ - v for v in global_vars - if v.op.name == 'var0/ExponentialMovingAverage' - ][0] - ema_var1 = [ - v for v in global_vars - if v.op.name == 'var1/ExponentialMovingAverage' - ][0] - perturb = control_flow_ops.group([ - state_ops.assign_add(var0, [1.0, 1.0]), - state_ops.assign_add(var1, [2.0, 2.0]), - state_ops.assign_add(ema_var0, [3.0, 3.0]), - state_ops.assign_add(ema_var1, [4.0, 4.0]) - ]) + opt = moving_average_optimizer.MovingAverageOptimizer( + gradient_descent.GradientDescentOptimizer(learning_rate=2.0), + average_decay=0.5, + sequential_update=sequential_update) + save_dir = tempfile.mkdtemp( + prefix=os.path.join(self.get_temp_dir(), 'run_1')) + save_path = os.path.join(save_dir, 'model') - # Test that saver with missing ema variables will fail. - with self.assertRaisesRegexp(ValueError, r'Variable to swap'): - opt.swapping_saver(var_list=[var0]) + update = opt.minimize(loss) - train_saver = opt.swapping_saver() - train_saver_subset = opt.swapping_saver(var_list=[var0, ema_var0]) - inference_saver = saver.Saver() - variables.global_variables_initializer().run() - # Step 1. 
- update.run() - self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval()) - self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval()) - if sequential_update: - self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval()) - self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval()) - # Test that the swapping saver save/restore operation is identity. - train_saver.save(sess, save_path) - train_saver.restore(sess, save_path) - self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval()) - self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval()) - if sequential_update: - self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval()) - self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval()) - # Test that the subset saver saves the EMA variable as well. - if sequential_update: - subset_save_path = save_path + '_subset' - train_saver_subset.save(sess, subset_save_path) - perturb.run() - self.assertAllCloseAccordingToType([1.8, 2.8], var0.eval()) - self.assertAllCloseAccordingToType([3.9, 4.9], ema_var0.eval()) - self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval()) - self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval()) - # Restoring should only restore var0 and ema_var0. - train_saver_subset.restore(sess, subset_save_path) + # Get variables and their EMAs. In case of partitioned variables + # get proper part of each variable. + def _get_variable(var_name, part_name, ema): + """Returns variable of it's moving average by name.""" + matches = [ + v for v in variables.global_variables() + if ((var_name in v.op.name) + and (part_name in v.op.name) + and (('ExponentialMovingAverage' in v.op.name) == ema)) + ] + self.assertEqual(len(matches), 1) + return matches[0] + var0 = _get_variable('var0', var_part_name, ema=False) + var1 = _get_variable('var1', var_part_name, ema=False) + ema_var0 = _get_variable('var0', var_part_name, ema=True) + ema_var1 = _get_variable('var1', var_part_name, ema=True) + + perturb = control_flow_ops.group([ + state_ops.assign_add(var0, [1.0, 1.0]), + state_ops.assign_add(var1, [2.0, 2.0]), + state_ops.assign_add(ema_var0, [3.0, 3.0]), + state_ops.assign_add(ema_var1, [4.0, 4.0]) + ]) + + # Test that saver with missing ema variables will fail. + with self.assertRaisesRegexp(ValueError, r'Variable to swap'): + opt.swapping_saver(var_list=[var0]) + + train_saver = opt.swapping_saver() + train_saver_subset = opt.swapping_saver(var_list=[var0, ema_var0]) + inference_saver = saver.Saver() + variables.global_variables_initializer().run() + # Step 1. + update.run() self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval()) - self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval()) - self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval()) - self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval()) - # Restore back to previous state. + self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval()) + if sequential_update: + self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval()) + self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval()) + # Test that the swapping saver save/restore operation is identity. 
+ train_saver.save(sess, save_path) train_saver.restore(sess, save_path) + self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval()) + self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval()) + if sequential_update: + self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval()) + self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval()) + # Test that the subset saver saves the EMA variable as well. + if sequential_update: + subset_save_path = save_path + '_subset' + train_saver_subset.save(sess, subset_save_path) + perturb.run() + self.assertAllCloseAccordingToType([1.8, 2.8], var0.eval()) + self.assertAllCloseAccordingToType([3.9, 4.9], ema_var0.eval()) + self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval()) + self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval()) + # Restoring should only restore var0 and ema_var0. + train_saver_subset.restore(sess, subset_save_path) + self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval()) + self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval()) + self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval()) + self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval()) + # Restore back to previous state. + train_saver.restore(sess, save_path) - # If updates are parallel, this is not always true after the 1st step. - if sequential_update: + # If updates are parallel, + # this is not always true after the 1st step. + if sequential_update: + # Test that the normal saver will have the averaged variables. + # We test that the average values are between the original value + # and the most recent variable values (since they are an average + # of the two). + val0 = var0.eval() + val1 = var1.eval() + train_saver.save(sess, save_path) + inference_saver.restore(sess, save_path) + avg_val0 = var0.eval() + avg_val1 = var1.eval() + for i in six.moves.range(len(val0)): + self.assertLess(val0[i], avg_val0[i]) + self.assertLess(avg_val0[i], orig_val0[i]) + self.assertLess(val1[i], avg_val1[i]) + self.assertLess(avg_val1[i], orig_val1[i]) + train_saver.restore(sess, save_path) + # Step 2. + update.run() # Test that the normal saver will have the averaged variables. - # We test that the average values are between the original value - # and the most recent variable values (since they are an average - # of the two). + # We test that the average values are between the original value and + # the most recent variable values (since they are an average of the + # two). val0 = var0.eval() val1 = var1.eval() + self.assertAllCloseAccordingToType([0.6, 1.6], val0) + self.assertAllCloseAccordingToType([2.96, 3.96], val1) train_saver.save(sess, save_path) inference_saver.restore(sess, save_path) avg_val0 = var0.eval() @@ -143,26 +219,6 @@ class MovingAverageOptimizerTest(test.TestCase): self.assertLess(avg_val0[i], orig_val0[i]) self.assertLess(val1[i], avg_val1[i]) self.assertLess(avg_val1[i], orig_val1[i]) - train_saver.restore(sess, save_path) - # Step 2. - update.run() - # Test that the normal saver will have the averaged variables. - # We test that the average values are between the original value and - # the most recent variable values (since they are an average of the - # two). 
- val0 = var0.eval() - val1 = var1.eval() - self.assertAllCloseAccordingToType([0.6, 1.6], val0) - self.assertAllCloseAccordingToType([2.96, 3.96], val1) - train_saver.save(sess, save_path) - inference_saver.restore(sess, save_path) - avg_val0 = var0.eval() - avg_val1 = var1.eval() - for i in six.moves.range(len(val0)): - self.assertLess(val0[i], avg_val0[i]) - self.assertLess(avg_val0[i], orig_val0[i]) - self.assertLess(val1[i], avg_val1[i]) - self.assertLess(avg_val1[i], orig_val1[i]) def testFailWhenSaverCreatedBeforeInitialized(self): with self.cached_session(): diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py index 44a8890cb10..155ff5b3f4f 100644 --- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py +++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py @@ -1,4 +1,4 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py index f789c83e005..467dd86d8fd 100644 --- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py +++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py @@ -790,14 +790,7 @@ class OptimizerV2(optimizer_v1.Optimizer): # Scale loss for number of replicas (callable-loss case). In this case, # we have to be careful to call distribute_lib.get_loss_reduction() # *after* loss() is evaluated, so we know what loss reduction it uses. - if scale_loss_by_num_replicas is None: - scale_loss_by_num_replicas = ( - distribute_lib.get_loss_reduction() == variable_scope - .VariableAggregation.MEAN) - if scale_loss_by_num_replicas: - num_replicas = distribute_ctx.get_distribution_strategy().num_replicas - if num_replicas > 1: - loss_value *= 1. / num_replicas + loss_value = self._scale_loss(loss_value, scale_loss_by_num_replicas) if var_list is None: var_list = tape.watched_variables() @@ -808,14 +801,7 @@ class OptimizerV2(optimizer_v1.Optimizer): "be a function when eager execution is enabled.") # Scale loss for number of replicas (non-callable-loss case). - if scale_loss_by_num_replicas is None: - scale_loss_by_num_replicas = ( - distribute_lib.get_loss_reduction() == variable_scope - .VariableAggregation.MEAN) - if scale_loss_by_num_replicas: - num_replicas = distribute_ctx.get_distribution_strategy().num_replicas - if num_replicas > 1: - loss *= 1. / num_replicas + loss = self._scale_loss(loss, scale_loss_by_num_replicas) if gate_gradients not in [ optimizer_v1.Optimizer.GATE_NONE, optimizer_v1.Optimizer.GATE_OP, @@ -857,6 +843,20 @@ class OptimizerV2(optimizer_v1.Optimizer): ]) return grads_and_vars + @staticmethod + def _scale_loss(loss_value, scale_loss_by_num_replicas): + """Scale loss for the number of replicas.""" + if scale_loss_by_num_replicas is None: + scale_loss_by_num_replicas = ( + distribute_lib.get_loss_reduction() == variable_scope + .VariableAggregation.MEAN) + if scale_loss_by_num_replicas: + num_replicas = \ + distribute_ctx.get_distribution_strategy().num_replicas_in_sync + if num_replicas > 1: + loss_value *= 1. / num_replicas + return loss_value + def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. 
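The duplicated scaling branches above are folded into a single _scale_loss helper. A standalone sketch of the rule it applies (plain Python; in the real helper the reduction mode and replica count come from the active distribution strategy):

def scale_loss(loss_value, num_replicas_in_sync, reduction_is_mean):
  # Mirrors the branch in OptimizerV2._scale_loss: under MEAN loss reduction the
  # per-replica loss is divided by the number of replicas training in sync.
  if reduction_is_mean and num_replicas_in_sync > 1:
    loss_value *= 1.0 / num_replicas_in_sync
  return loss_value

assert scale_loss(8.0, 4, reduction_is_mean=True) == 2.0
assert scale_loss(8.0, 4, reduction_is_mean=False) == 8.0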
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc index 057e851aba6..15ae95f13cf 100644 --- a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc +++ b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc @@ -141,7 +141,7 @@ __global__ void lstm_gates(const T* icfo, const T* b, const T* cs_prev, // const int gid = batch_id * cell_size * 4 + act_id; const int cid = batch_id * cell_size + act_id; - Eigen::internal::scalar_sigmoid_op sigmoid_op; + Eigen::internal::scalar_logistic_op sigmoid_op; Eigen::internal::scalar_tanh_op tanh_op; Eigen::scalar_clip_op clip_op; diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD index 291ff83791c..f0947fe423f 100644 --- a/tensorflow/contrib/saved_model/BUILD +++ b/tensorflow/contrib/saved_model/BUILD @@ -82,7 +82,6 @@ py_library( name = "keras_saved_model", srcs = ["python/saved_model/keras_saved_model.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], visibility = ["//visibility:public"], deps = [ "//tensorflow/python:array_ops", @@ -103,7 +102,7 @@ py_test( size = "medium", srcs = ["python/saved_model/keras_saved_model_test.py"], srcs_version = "PY2AND3", - tags = ["notsan"], + tags = ["no_windows"], deps = [ ":keras_saved_model", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py index 4970ebc3199..a65b2ce4661 100644 --- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py +++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py @@ -345,21 +345,22 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): inputs, outputs = load_model(sess, output_path, model_fn_lib.ModeKeys.EVAL) - sess.run(outputs['metrics/mae/update_op'], { - inputs[input_name]: input_arr, - inputs[target_name]: target_arr - }) + # First obtain the loss and predictions, and run the metric update op by + # feeding in the inputs and targets. + loss, predictions, _ = sess.run( + (outputs['loss'], outputs['predictions/' + output_name], + outputs['metrics/mae/update_op']), + {inputs[input_name]: input_arr, inputs[target_name]: target_arr}) - eval_results = sess.run(outputs, {inputs[input_name]: input_arr, - inputs[target_name]: target_arr}) + # The metric value should be run after the update op, to ensure that it + # reflects the correct value. 
+ metric_value = sess.run(outputs['metrics/mae/value']) self.assertEqual(int(train_before_export), sess.run(training_module.get_global_step())) - self.assertAllClose(ref_loss, eval_results['loss'], atol=1e-05) - self.assertAllClose( - ref_mae, eval_results['metrics/mae/value'], atol=1e-05) - self.assertAllClose( - ref_predict, eval_results['predictions/' + output_name], atol=1e-05) + self.assertAllClose(ref_loss, loss, atol=1e-05) + self.assertAllClose(ref_mae, metric_value, atol=1e-05) + self.assertAllClose(ref_predict, predictions, atol=1e-05) # Load train graph, and check for the train op, and prediction values with session.Session(graph=ops.Graph()) as sess: diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD index 6bd58c4d322..5e4f130b314 100644 --- a/tensorflow/contrib/signal/BUILD +++ b/tensorflow/contrib/signal/BUILD @@ -4,129 +4,11 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) -load("//tensorflow:tensorflow.bzl", "cuda_py_tests") -load("//tensorflow:tensorflow.bzl", "py_test") # @unused - py_library( name = "signal_py", - srcs = ["__init__.py"] + glob(["python/ops/*.py"]), + srcs = ["__init__.py"], srcs_version = "PY2AND3", deps = [ - "//tensorflow/python:array_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:spectral_ops", - "//tensorflow/python:tensor_util", - "//tensorflow/python:util", - "//third_party/py/numpy", - ], -) - -py_library( - name = "test_util", - srcs = ["python/kernel_tests/test_util.py"], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/core:protos_all_py", - "//tensorflow/python:tf_optimizer", - "//tensorflow/python:training", - ], -) - -cuda_py_tests( - name = "mel_ops_test", - srcs = ["python/kernel_tests/mel_ops_test.py"], - additional_deps = [ - ":signal_py", - ":test_util", - "//third_party/py/numpy", - "//tensorflow/python:client_testlib", - ], -) - -cuda_py_tests( - name = "mfcc_ops_test", - srcs = ["python/kernel_tests/mfcc_ops_test.py"], - additional_deps = [ - ":signal_py", - "//third_party/py/numpy", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:spectral_ops_test_util", - ], -) - -cuda_py_tests( - name = "reconstruction_ops_test", - srcs = ["python/kernel_tests/reconstruction_ops_test.py"], - additional_deps = [ - ":signal_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:gradients", - "//tensorflow/python:math_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - ], -) - -cuda_py_tests( - name = "shape_ops_test", - srcs = ["python/kernel_tests/shape_ops_test.py"], - additional_deps = [ - ":signal_py", - ":test_util", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - ], -) - -cuda_py_tests( - name = "spectral_ops_test", - size = "large", - srcs = ["python/kernel_tests/spectral_ops_test.py"], - additional_deps = [ - 
":signal_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:gradients", - "//tensorflow/python:math_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - "//tensorflow/python:spectral_ops_test_util", - ], - tags = ["nomac"], -) - -cuda_py_tests( - name = "window_ops_test", - srcs = ["python/kernel_tests/window_ops_test.py"], - additional_deps = [ - ":signal_py", - ":test_util", - "//third_party/py/numpy", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", + "//tensorflow/python/ops/signal", ], ) diff --git a/tensorflow/contrib/signal/__init__.py b/tensorflow/contrib/signal/__init__.py index d088e744346..d01f5ccf51c 100644 --- a/tensorflow/contrib/signal/__init__.py +++ b/tensorflow/contrib/signal/__init__.py @@ -14,6 +14,9 @@ # ============================================================================== """Signal processing operations. +`tf.contrib.signal` has been renamed to `tf.signal`. `tf.contrib.signal` will be +removed in TensorFlow 2.0. + See the [Contrib Signal](https://tensorflow.org/api_guides/python/contrib.signal) guide. @@ -39,18 +42,20 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.signal.python.ops.mel_ops import linear_to_mel_weight_matrix -from tensorflow.contrib.signal.python.ops.mfcc_ops import mfccs_from_log_mel_spectrograms -from tensorflow.contrib.signal.python.ops.reconstruction_ops import overlap_and_add -from tensorflow.contrib.signal.python.ops.shape_ops import frame -# `frame` used to be named `frames`, which is a noun and not a verb. -# Keep an alias to `frames` for backwards compatibility. -from tensorflow.contrib.signal.python.ops.shape_ops import frame as frames -from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft -from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft_window_fn -from tensorflow.contrib.signal.python.ops.spectral_ops import stft -from tensorflow.contrib.signal.python.ops.window_ops import hamming_window -from tensorflow.contrib.signal.python.ops.window_ops import hann_window +from tensorflow.python.ops.signal.mel_ops import linear_to_mel_weight_matrix +from tensorflow.python.ops.signal.mfcc_ops import mfccs_from_log_mel_spectrograms +from tensorflow.python.ops.signal.reconstruction_ops import overlap_and_add +from tensorflow.python.ops.signal.shape_ops import frame +from tensorflow.python.ops.signal.spectral_ops import inverse_stft +from tensorflow.python.ops.signal.spectral_ops import inverse_stft_window_fn +from tensorflow.python.ops.signal.spectral_ops import stft +from tensorflow.python.ops.signal.window_ops import hamming_window +from tensorflow.python.ops.signal.window_ops import hann_window from tensorflow.python.util.all_util import remove_undocumented + +# `frame` used to be named `frames`, which is a noun and not a verb. +# Keep an alias to `frames` for backwards compatibility. 
+frames = frame + remove_undocumented(__name__) diff --git a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py index 596c59ead34..290c16fe396 100644 --- a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py +++ b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from tensorflow.contrib.tensor_forest.python.ops import gen_model_ops # pylint: disable=unused-import @@ -28,10 +30,12 @@ from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import update_mod # pylint: enable=unused-import from tensorflow.contrib.util import loader +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import resources from tensorflow.python.platform import resource_loader from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking _model_ops = loader.load_op_library( @@ -88,6 +92,59 @@ class TreeVariableSavable(saver.BaseSaverBuilder.SaveableObject): params=self.params.serialized_params_proto) +class TreeVariable(tracking.TrackableResource): + """A tree model.""" + + def __init__(self, params, tree_config, stats_handle, name, container=None): + self._params = params + self._tree_config = tree_config + self._stats_handle = stats_handle + self._name = name + self._container = container + self._init_op = None + super(TreeVariable, self).__init__() + self._resource_handle = self.create_resource() + + def create_resource(self): + if context.executing_eagerly(): + # TODO(allenl): This will leak memory due to kernel caching by the + # shared_name attribute value (but is better than the alternative of + # sharing everything by default when executing eagerly; hopefully creating + # tables in a loop is uncommon). + shared_name = "tree_variable_%d" % (ops.uid(),) + else: + shared_name = self._name + return gen_model_ops.decision_tree_resource_handle_op( + self._container, shared_name=shared_name, name=self._name) + + def initialize(self): + return gen_model_ops.create_tree_variable( + self.resource_handle, + self._tree_config, + params=self._params.serialized_params_proto) + + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + return gen_model_ops.tree_is_initialized_op(self.resource_handle) + + def _gather_saveables_for_checkpoint(self): + """For object-based checkpointing.""" + return { + "tree_variable": + functools.partial( + TreeVariableSavable, + params=self._params, + tree_handle=self.resource_handle, + stats_handle=self._stats_handle, + create_op=self._init_op) + } + + def tree_variable(params, tree_config, stats_handle, name, container=None): r"""Creates a tree model and returns a handle to it. @@ -102,18 +159,13 @@ def tree_variable(params, tree_config, stats_handle, name, container=None): A `Tensor` of type mutable `string`. The handle to the tree. 
""" with ops.name_scope(name, "TreeVariable") as name: - resource_handle = gen_model_ops.decision_tree_resource_handle_op( - container, shared_name=name, name=name) - - create_op = gen_model_ops.create_tree_variable( - resource_handle, - tree_config, - params=params.serialized_params_proto) - is_initialized_op = gen_model_ops.tree_is_initialized_op(resource_handle) + tree_var = TreeVariable(params, tree_config, stats_handle, name, container) + resource_handle = tree_var.resource_handle + create_op = tree_var.initializer + is_initialized_op = tree_var.is_initialized() # Adds the variable to the savable list. - saveable = TreeVariableSavable(params, resource_handle, stats_handle, - create_op, - resource_handle.name) + saveable = tree_var._gather_saveables_for_checkpoint()["tree_variable"]( # pylint: disable=protected-access + name=resource_handle.name) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) resources.register_resource(resource_handle, create_op, is_initialized_op) return resource_handle diff --git a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py index 44d486edecc..9184198cd4c 100644 --- a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py +++ b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from tensorflow.contrib.tensor_forest.python.ops import gen_stats_ops # pylint: disable=unused-import from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import finalize_tree @@ -25,10 +27,12 @@ from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import process_in # pylint: enable=unused-import from tensorflow.contrib.util import loader +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import resources from tensorflow.python.platform import resource_loader from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking _stats_ops = loader.load_op_library( @@ -84,8 +88,58 @@ class FertileStatsVariableSavable(saver.BaseSaverBuilder.SaveableObject): params=self.params.serialized_params_proto) -def fertile_stats_variable(params, stats_config, name, - container=None): +class FertileStatsVariable(tracking.TrackableResource): + """A Fertile stats variable.""" + + def __init__(self, params, stats_config, name, container=None): + self._params = params + self._stats_config = stats_config + self._name = name + self._container = container + self._init_op = None + super(FertileStatsVariable, self).__init__() + self._resource_handle = self.create_resource() + + def create_resource(self): + if context.executing_eagerly(): + # TODO(allenl): This will leak memory due to kernel caching by the + # shared_name attribute value (but is better than the alternative of + # sharing everything by default when executing eagerly; hopefully creating + # tables in a loop is uncommon). 
+ shared_name = "fertile_stats_variable_%d" % (ops.uid(),) + else: + shared_name = self._name + return gen_stats_ops.fertile_stats_resource_handle_op( + self._container, shared_name=shared_name, name=self._name) + + def initialize(self): + return gen_stats_ops.create_fertile_stats_variable( + self.resource_handle, + self._stats_config, + params=self._params.serialized_params_proto) + + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + return gen_stats_ops.fertile_stats_is_initialized_op(self.resource_handle) + + def _gather_saveables_for_checkpoint(self): + """For object-based checkpointing.""" + return { + "fertile_stats_variable": + functools.partial( + FertileStatsVariableSavable, + params=self._params, + stats_handle=self.resource_handle, + create_op=self.initializer) + } + + +def fertile_stats_variable(params, stats_config, name, container=None): r"""Creates a stats object and returns a handle to it. Args: @@ -98,17 +152,15 @@ def fertile_stats_variable(params, stats_config, name, A `Tensor` of type mutable `string`. The handle to the stats. """ with ops.name_scope(name, "FertileStatsVariable") as name: - resource_handle = gen_stats_ops.fertile_stats_resource_handle_op( - container, shared_name=name, name=name) - - create_op = gen_stats_ops.create_fertile_stats_variable( - resource_handle, stats_config, - params=params.serialized_params_proto) - is_initialized_op = gen_stats_ops.fertile_stats_is_initialized_op( - resource_handle) + fertile_stats_var = FertileStatsVariable(params, stats_config, name, + container) + resource_handle = fertile_stats_var.resource_handle + create_op = fertile_stats_var.initializer + is_initialized_op = fertile_stats_var.is_initialized() # Adds the variable to the savable list. 
- saveable = FertileStatsVariableSavable(params, resource_handle, create_op, - resource_handle.name) + saveable = ( + fertile_stats_var._gather_saveables_for_checkpoint()[ # pylint: disable=protected-access + "fertile_stats_variable"](name=resource_handle.name)) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) resources.register_resource(resource_handle, create_op, is_initialized_op) return resource_handle diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index 1f5591fe2a6..26d54eb156c 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -141,6 +141,7 @@ Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) { std::vector input_edges; TF_RETURN_IF_ERROR(node->input_edges(&input_edges)); std::vector> input_node_and_ports; + input_node_and_ports.reserve(input_edges.size()); for (const Edge* input_edge : input_edges) { input_node_and_ports.emplace_back(&input_edge->src()->def(), input_edge->src_output()); @@ -923,7 +924,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { converted_segments.push_back(std::move(curr_segment)); if (VLOG_IS_ON(8)) { - string fname = curr_engine.engine_name; + string fname = engine_segments.back().engine_name; StrAppend(&fname, ".pb"); std::fstream f; f.open(fname.c_str(), std::fstream::out | std::fstream::binary); diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index a6f954391d3..e2988f5f2a8 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -1552,6 +1552,7 @@ tensorflow::Status ConvertPlugin(OpConverterParams* params) { const auto& node_def = params->node_def; // prepare input std::vector all_inputs; + all_inputs.reserve(inputs.size()); for (auto input : inputs) { all_inputs.emplace_back(const_cast(input.tensor())); } diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 67327d32000..a0a9cb3f31a 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -246,6 +246,7 @@ py_library( "python/tpu/bfloat16.py", "python/tpu/device_assignment.py", "python/tpu/session_support.py", + "python/tpu/tensor_tracer.py", "python/tpu/topology.py", "python/tpu/tpu.py", "python/tpu/tpu_feed.py", diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD index 38d1c3049ef..541fbf33a30 100644 --- a/tensorflow/contrib/tpu/profiler/BUILD +++ b/tensorflow/contrib/tpu/profiler/BUILD @@ -94,13 +94,6 @@ tf_proto_library( visibility = ["//visibility:public"], ) -tf_proto_library( - name = "tf_op_stats_proto", - srcs = ["tf_op_stats.proto"], - cc_api_version = 2, - visibility = ["//visibility:public"], -) - tf_proto_library( name = "tpu_profiler_analysis_proto", srcs = ["tpu_profiler_analysis.proto"], diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto deleted file mode 100644 index 1e66801efd4..00000000000 --- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto +++ /dev/null @@ -1,261 +0,0 @@ -// This proto describes the format of tensorflow operation level stats for -// profiling (in tensorboard) purpose. - -syntax = "proto2"; - -package tensorflow.tpu; - -// Result proto for OpMetrics. -message OpMetricsResult { - // True if this OP is executed on the device; False if it is executed on the - // host. 
- optional bool on_device = 1; - reserved 2; // was uint32 id. - // Name of this OP. - optional string name = 3; - // Rank of this OP. - optional uint64 rank = 4; - // The starting time in cycles of the last instance of this OP executed. - optional double last_starttime_in_cycles = 5; - // The ending time in cycles of the last instance of this OP executed. - optional double last_endtime_in_cycles = 6; - // If this OP (say A), is an immediate child of another OP (say B), this field - // stores the sum of duration in microseconds of A inside B. If A appears more - // than once in B, the duration of all A's appearances will be added together. - // This sum will be reset after the self-time of B is calculated so that it - // can be reused for a new parent OP. - optional double sum_of_duration_in_us_as_children = 7; - // Number of instances that this OP occurred. - optional uint64 occurrences = 8; - // Total time in microseconds spent in this OP (accumulated - // over all of its occurrences). - optional double total_time_in_us = 9; - // Total self time in microseconds spent in this OP - // (accumulated over all of its occurrences). - optional double total_self_time_in_us = 10; - // The total self time as a fraction of sum of all OP's - // total self time on the host. - optional double host_total_self_time_as_fraction_of_all_op_time = 11; - // Cumulative total self time in fraction on the host. - optional double host_cumulative_total_self_time_as_fraction_of_all_op_time = - 12; - // The total self time as a fraction of sum of all OP's - // total self time on the device. - optional double device_total_self_time_as_fraction_of_all_op_time = 13; - // Cumulative total self time in fraction on the device. - optional double device_cumulative_total_self_time_as_fraction_of_all_op_time = - 14; - // Total number of FLOPs incurred by this OP. - optional double total_flops = 15; - // Total number of bytes accessed by this OP. - optional double total_bytes_accessed = 16; - // Total time in microseconds that special hw unit 1 is occupied by this OP. - optional double unit1_occupancy_in_us = 17; - // Total time in microseconds that special hw unit 2 is occupied by this OP. - optional double unit2_occupancy_in_us = 18; - // Total memory stall time in microseconds. - optional double total_memory_stall_in_us = 19; -} - -// Result proto for OpMetricsDb. -message OpMetricsDbResult { - // A bunch of OpMetricsResults. - repeated OpMetricsResult metrics_db = 1; - // The total host infeed-enqueue duration in picoseconds. - optional uint64 total_host_infeed_enq_duration_ps = 2; - // The total of the difference between the start times of two - // consecutive infeed-enqueues (per host) in picoseconds. - optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3; - // The total device time in microseconds. - optional double total_device_time_in_us = 4; - // The total host time in microseconds. - optional double total_host_time_in_us = 5; -} - -// Result proto for StepInfo. -message StepInfoResult { - // The (micro) step number. - optional uint32 step_num = 1; - // The step duration in picoseconds. - optional uint64 duration_ps = 2; - // The infeed duration in picoseconds. - optional uint64 infeed_duration_ps = 3; - // The outfeed duration in picoseconds. - optional uint64 host_outfeed_ps = 8; - // The start time of this step in picoseconds. - optional uint64 begin_ps = 4; - // The waiting time within this step in picoseconds. - optional uint64 wait_duration_ps = 5; - // The unit b outfeed duration in picoseconds. 
- optional uint64 unit_b_outfeed_ps = 9; - // The time spent on cross-replica-sum in picoseconds. - optional uint64 crs_duration_ps = 6; - // Percentage of unit b time spent on infeed. - optional double unit_b_infeed_percent = 7; -} - -// Result proto for a sequence of steps. -message StepSequenceResult { - // A sequence of StepInfoResults. - repeated StepInfoResult step_sequence = 1; -} - -// Result proto for a StepDatabase. -message StepDatabaseResult { - // A map from core_id to StepSequenceResult. - map step_sequence_per_core = 1; -} - -// Result proto for looping-related metrics. -message LoopingResult { - // The total iteration time in nanoseconds. - optional double iteration_time_ns = 1; - // The total number of iterations. - optional int32 num_iterations = 2; - // The total computation time in nanoseconds. - optional double computation_time_ns = 3; - // The total number of computations. - optional int32 num_computations = 4; -} - -// Result proto for HloExtraInfo. -message HloExtraInfoResult { - // Category of the HLO op given by the compiler. - optional string category = 1; - // The long name of the HLO that includes the dimensions. - optional string long_name = 2; - // The per-TPU-core batch size inferred from this HLO. - optional int64 per_core_batch_size = 3; -} - -// Result proto for HloExtraInfoMap. -message HloExtraInfoMapResult { - // A map from HLO name to HloExtraInfo. - map hlo_extrainfo_map = 1; -} - -// Result proto for host-independent job information. -message HostIndependentJobInfoResult { - // The change-list number of this build. - optional int64 change_list = 1; - // The time of this build. - optional int64 build_time = 2; - // The target of this build. - optional string build_target = 3; -} - -// Result proto for host-dependent job information. -message HostDependentJobInfoResult { - // This ID of the host where the job was run on. - optional string host_id = 1; - // The command line used to run the job. - optional string command_line = 2; - // The start time of the job on this host. - optional int64 start_time = 3; -} - -// Result proto for RunEnvironment (the run environment of a profiling session). -message RunEnvironmentResult { - // Number of hosts used. - optional int32 host_count = 1; - // The type of TPU used. - optional string tpu_type = 2; - // The number of TPU cores used. - optional int32 tpu_core_count = 3; - // The per-TPU-core batch size. - optional int32 per_core_batch_size = 4; - // Host-independent job information. - optional HostIndependentJobInfoResult host_independent_job_info = 5; - // Host-dependent job information. - repeated HostDependentJobInfoResult host_dependent_job_info = 6; - // The number of replicas, corresponds to input parallelism. - // If there is no model parallelism, replica_count = tpu_core_count - optional int32 replica_count = 7; - // The number of cores used for a single replica, e.g. model parallelism. - // If there is no model parallelism, then num_cores_per_replica = 1 - optional int32 num_cores_per_replica = 8; -} - -// The types of host operations that are tracked. -enum HostOp { - // Invalid host op. - kINVALIDHostOp = 0; - // Each of host op type has two parts: - // (1) the stage where the op happens and (2) the op name. - // stage = Input Data Producer, op = Get Next Batch. - kInputDataProducerGetNextBatch = 1; - // stage = Input Data Producer, op = Session Run. - kInputDataProducerSessionRun = 2; - // stage = Input Data Producer, op = Forward Batch. 
- kInputDataProducerForwardBatch = 3; - // stage = Infeed Thread, op = Get Next Batch. - kInfeedThreadGetNextBatch = 4; - // stage = Infeed Thread, op = Session Run. - kInfeedThreadSessionRun = 5; - // stage = Infeed Thread, op = Forward Batch. - kInfeedThreadForwardBatch = 6; - // stage = Outfeed Thread, op = Get Next Batch. - kOutfeedThreadGetNextBatch = 7; - // stage = Outfeed Thread, op = Session Run. - kOutfeedThreadSessionRun = 8; - // stage = Outfeed Thread, op = Forward Batch. - kOutfeedThreadForwardBatch = 9; -} - -// Result proto for the host ops per TPU step. -message HostOpsPerTpuStep { - // Whether the data in this message is valid. - optional bool valid = 1 [default = false]; - // The current TPU step number. - optional uint32 tpu_step_num = 2; - // The beginning time of the current TPU step on the device in picoseconds. - optional uint64 tpu_step_begin_ps = 3; - // The ending time of the current TPU step on the device in picoseconds. - optional uint64 tpu_step_end_ps = 4; - // For each possible host operation, maps to the difference between the TPU - // step number that the host op targets and the current TPU step number. - // The key is HostOp, value is the step difference. - map step_diffs = 5; -} - -message HostOpsDetailsPerCore { - // Map from core id to HostOpsPerTpuStep. - map core_map = 1; -} - -message HostOpsDetailsPerHost { - // Map from hostname to a map from core id to HostOpsPerTpuStep. - map host_map = 1; -} - -// Result proto for the host ops for all TPU steps. -message HostOpsResult { - reserved 1; // (was repeated HostOpsPerTpuStep host_op_sequence) - // A sequence of records with one for each TPU step. Each record - // is a map from hostname to a map from core id to HostOpsPerTpuStep. - repeated HostOpsDetailsPerHost hostops_details = 2; -} - -// Result proto for TfStatsHelper. -message TfOpStats { - // The result for the TF-metric database. - optional OpMetricsDbResult tf_metrics_db = 1; - // The result for the HLO-metric database. - optional OpMetricsDbResult hlo_metrics_db = 2; - // The result for the step database. - optional StepDatabaseResult step_db = 3; - // The result for the looping-related metrics. - optional LoopingResult looping = 4; - // The result for the HloExtraInfoMap. - optional HloExtraInfoMapResult hlo_extrainfo_map = 5; - // Overall matrix unit utilization in percentage. - optional double matrix_unit_utilization_percent = 6; - // The run environment of this profiling session. - optional RunEnvironmentResult run_environment = 7; - // The result for the host operations. - optional HostOpsResult host_ops = 8; - // A map from core ID to name. - map core_id_to_name_map = 9; - // The result for hw unit b stats. - optional bytes unit_b_stats = 10; -} diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto index c2e3be03db0..aae1ab1d37a 100644 --- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto +++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto @@ -154,6 +154,14 @@ message OptimizationParameters { // updates; not present means no limits are applied. ClippingLimits gradient_clipping_limits = 7; + // Amount of weight decay to apply; see weight_decay_optimizers.py for + // details. Almost all optimizers are supported with this option (MDL Adagrad + // Light does not work, and SGD does not behave as expected if it is enabled). 
+ // Although there is no check, users who want weight decay will probably also + // want to enable gradient accumulation as well so that the decay will happen + // once per minibatch. + float weight_decay_factor = 16; + // Whether to use gradient accumulation (do two passes over the input // gradients: one to accumulate them into a temporary array and another to // apply them using the actual optimization algorithm). This feature is diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py index c32bd5997c1..1cf7f9fcf67 100644 --- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py +++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py @@ -164,14 +164,15 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook): SessionLog( status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path), step) + + for l in self._listeners: + l.after_save(session, step) + end_time = time.time() logging.info("Checkpoint actual writing time: (%.3f sec)", end_time - start_time) logging.info("Checkpoint finished for %d into %s.", step, self._save_path) - for l in self._listeners: - l.before_save(session, step) - if not asynchronous: _save_fn() return diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py index ce2c322ff49..08f58a5f5b8 100644 --- a/tensorflow/contrib/tpu/python/tpu/keras_support.py +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -1184,8 +1184,7 @@ class TPUFunction(object): # pipelined loop. return None, None - if (self.model.uses_learning_phase and - not isinstance(K.learning_phase(), int)): + if not isinstance(K.learning_phase(), int): # Remove the learning_phase flag at the end. We currently hard code the # learning_phase in TPUFunction. assert isinstance(inputs[-1], int), ( @@ -1651,7 +1650,7 @@ class KerasTPUModel(models.Model): self._make_train_function() sample_weights = sample_weights or [] val_sample_weights = val_sample_weights or [] - if self.uses_learning_phase and not isinstance(K.learning_phase(), int): + if not isinstance(K.learning_phase(), int): ins = inputs + targets + sample_weights + [1] else: ins = inputs + targets + sample_weights diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py new file mode 100644 index 00000000000..70baea203cc --- /dev/null +++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py @@ -0,0 +1,553 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ======================================================================== +"""A utility to trace tensor values on TPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path +import re + +from tensorflow.contrib.tpu.python.ops import tpu_ops +from tensorflow.contrib.tpu.python.tpu import tpu +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_util +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import tf_logging as logging + +_TRACER_LOG_PREFIX = ' [>>>TT>>>]' +_DEVICE_TYPE_TPU = 'tpu' +_DEVICE_TYPE_CPU = 'cpu' +_GLOBAL_STEP_OP_NAME = 'GLOBAL-STEP' +_TRACE_MODE_NAN_INF = 'nan-inf' +_TRACE_MODE_PART_TENSOR = 'part-tensor' +_TRACE_MODE_PART_TENSOR_SIZE = 3 +_TRACE_MODE_FULL_TENSOR = 'full-tensor' +_RECORD_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range' +_RECORD_SHOULD_NOT_TRACE = 'not-traced-should-not-trace' +_RECORD_FILTERED_OUT = 'not-traced-filtered-out' +_RECORD_SCALAR = 'not-traced-scalar' +_RECORD_DYNAMIC_SHAPE = 'not-traced-dynamic-shape' +_RECORD_GET_TRACED = 'get-traced' +_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:' +_MARKER_SECTION_END = '!!!!!!! section-end:' +_SECTION_NAME_CONFIG = 'configuration' +_SECTION_NAME_REASON = 'reason' +_SECTION_NAME_OP_LIST = 'op-list' +_SECTION_NAME_GRAPH = 'graph' +_FIELD_NAME_VERSION = 'version:' +_FIELD_NAME_DEVICE = 'device:' +_FIELD_NAME_TRACE_MODE = 'trace-mode:' +_FIELD_NAME_NUM_REPLICAS = 'num-replicas:' +_FIELD_NAME_NUM_OPS = 'number-of-ops:' +_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:' +_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS' +_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'") +_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"') +_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)') +_FLAG_NAME_ENABLE = 'enable' +_FLAG_NAME_TRACE_MODE = 'trace_mode' +_FLAG_NAME_INTERESTING_OPS = 'interesting_ops' +_FLAG_NAME_TRACE_FILE = 'trace_file_path' +_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir' +_FLAG_NAME_OP_RANGE = 'op_range' +_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)') +_OUTPUT_STREAM_ESCAPE = 'file://' +_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR' + + +class TensorTracer(object): + """A software construct for tracing tensor values in a TF graph on TPU. + + This utility is disabled by default. It can be enabled by setting + the TENSOR_TRACER_FLAGS env variable as: + export TENSOR_TRACER_FLAGS="--enable=1" + If it is enabled, it will trace the output tensor values of + selected Ops in the graph. It has two outputs: (1) the traces and (2) + a report. The traces are dumped to a specified local file on the TPU + host. The report is printed to the log.info of the TPU job. + By passing options via the env variable, users can change: + (1) the trace mode (e.g., detecting NaN/Inf, printing partial or + full tensor values) + (2) which Ops to be traced (via op.name or op.type) + (3) output trace file path. 
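  For example, a fuller configuration can be set from Python before the graph
  is constructed. The sketch below is illustrative only and is not part of this
  change: the flag names and trace modes come from the _FLAG_NAME_* and
  _TRACE_MODE_* constants defined in this module, while the op names and the
  trace file path are arbitrary placeholders.

      import os

      # Hypothetical configuration: trace partial values of two named ops among
      # the first 1000 ops, writing the trace to a local file on the TPU host.
      os.environ['TENSOR_TRACER_FLAGS'] = (
          '--enable=1 '
          '--trace_mode=part-tensor '
          '--interesting_ops="dense/MatMul softmax/Softmax" '
          '--op_range=0:1000 '
          '--trace_file_path=/tmp/tensor_tracer_trace.txt')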
+ """ + + @staticmethod + def _match_next_flag(flags, pos): + """Returns the match for the next TensorTracer flag.""" + + match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos) + if match: + return match + match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos) + if match: + return match + match = _FLAG_NO_QUOTE_PAT.match(flags, pos) + return match + + @staticmethod + def print_flag_values(): + """Prints all TensorTracer flags passed via environment variables.""" + + tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR) + if not tensor_tracer_flags: + return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR + result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR, + tensor_tracer_flags) + result += 'Individual flag value:\n' + pos = 0 + while True: + match = TensorTracer._match_next_flag(tensor_tracer_flags, pos) + if not match: + break + flag_name = match.group(1) + flag_value = match.group(2) + result += ' %s: %s\n'%(flag_name, flag_value) + pos = match.end() + result += '\n' + return result + + @staticmethod + def get_flag_value(wanted_flag_name): + """Returns the value of a TensorTracer flags.""" + + tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR) + if not tensor_tracer_flags: + return '' + pos = 0 + while True: + match = TensorTracer._match_next_flag(tensor_tracer_flags, pos) + if not match: + return '' + flag_name = match.group(1) + flag_value = match.group(2) + if flag_name == wanted_flag_name: + return flag_value + pos = match.end() + return '' + + @staticmethod + def is_enabled(): + """Returns True if TensorTracer is enabled.""" + + flag_value = TensorTracer.get_flag_value(_FLAG_NAME_ENABLE) + flag_value = flag_value.lower() + enabled = flag_value in ['1', 't', 'true', 'y', 'yes'] + return enabled + + @staticmethod + def use_test_undeclared_outputs_dir(): + """Decides the output directory of the trace file. + + Args: + None. + + Returns: + True if the output trace file should be written to the + test-undeclared-outputs-directory defined via an + env variable. + """ + + flag_value = TensorTracer.get_flag_value( + _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR) + flag_value = flag_value.lower() + enabled = flag_value in ['1', 't', 'true', 'y', 'yes'] + return enabled + + @staticmethod + def check_device_type(device_type): + """Checks if the given device type is valid.""" + + if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]: + raise ValueError('Invalid device_type "%s"'%device_type) + + @staticmethod + def check_trace_mode(trace_mode): + """Checks if the given trace mode is valid.""" + + valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR, + _TRACE_MODE_FULL_TENSOR] + if trace_mode not in valid_trace_modes: + raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.' + 'Valid trace modes are: %s'%(trace_mode, + valid_trace_modes)) + + @staticmethod + def should_trace(device_type, op): + """Returns True if the given Op should be traced.""" + + if device_type != _DEVICE_TYPE_TPU: + raise ValueError('Non TPU device type is not supported') + if control_flow_util.IsInCond(op): + return False + if op.type in ['Reshape', 'ArgMin', 'ArgMax']: + return False + # pylint: disable=protected-access + return tpu._TPU_REPLICATE_ATTR in op.node_def.attr + # pylint: enable=protected-access + + @staticmethod + def reason(op_idx, details): + """Returns why the Op at op_idx is traced or not.""" + return '%d %s'%(op_idx, details) + + @staticmethod + def topological_sort(g): + """Performs topological sort on the given graph. + + Args: + g: the graph. 
+ + Returns: + A pair where the first element indicates if the topological + sort succeeded (True if there is no cycle found; False if a + cycle is found) and the second element is either the sorted + list of nodes or the cycle of nodes found. + """ + + def visit(op, cycle, permanently_marked_ops, + temporarily_marked_ops, sorted_ops): + """Recursively visits all Ops in a graph. + + Args: + op: the current Op being visited. + cycle: a cycle of Ops found. + permanently_marked_ops: the set of Ops that were already visited. + temporarily_marked_ops: the set of Ops that we have visited during + the current descent. + sorted_ops: the list of Ops sorted in topological order. + """ + + if cycle: + return + if op in permanently_marked_ops: + return + if op in temporarily_marked_ops: + cycle = temporarily_marked_ops + return + temporarily_marked_ops.add(op) + for i in range(len(op.outputs)): + out_tensor = op.outputs[i] + for consumer_op in out_tensor.consumers(): + visit(consumer_op, cycle, permanently_marked_ops, + temporarily_marked_ops, sorted_ops) + # pylint: disable=protected-access + for ctrl_output_op in op._control_outputs: + # pylint: enable=protected-access + visit(ctrl_output_op, cycle, permanently_marked_ops, + temporarily_marked_ops, sorted_ops) + temporarily_marked_ops.remove(op) + permanently_marked_ops.add(op) + sorted_ops.insert(0, op) + + graph_cycle = set([]) + sorted_ops = [] + permanently_marked_ops = set([]) + temporarily_marked_ops = set([]) + unsorted_ops = g.get_operations() + for op in unsorted_ops: + visit(op, graph_cycle, permanently_marked_ops, + temporarily_marked_ops, sorted_ops) + if graph_cycle: + return (False, graph_cycle) + else: + assert len(unsorted_ops) == len(sorted_ops) + return (True, sorted_ops) + + def __init__(self): + """Initializes a TensorTracer. + + Sets the various member fields from the flags (if given) or the defaults. + """ + self._version = 'use-outside-compilation' + self._device_type = None + self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE) + if not self._trace_mode: + self._trace_mode = _TRACE_MODE_NAN_INF + TensorTracer.check_trace_mode(self._trace_mode) + self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE + self._instrument_records = {} + interesting_ops = TensorTracer.get_flag_value(_FLAG_NAME_INTERESTING_OPS) + self._selected_ops = interesting_ops.split() + self._set_trace_file_path() + self._set_op_range() + self._num_replicas = None + self._replica_id = None + + def _add_replica_id_to_graph(self, num_replicas, result_tensor): + """Adds nodes for computing the replica ID to the graph.""" + + if not num_replicas: + self._replica_id = 'unknown' + return result_tensor + + self._num_replicas = num_replicas + + with ops.control_dependencies(None): + # Uses None as dependency to run outside of TPU graph rewrites. + self._replica_id = tpu_ops.tpu_replicated_input( + list(range(self._num_replicas)), + name='tt_replica_id') + use_replica_id = array_ops.identity(self._replica_id).op + with ops.control_dependencies([use_replica_id]): + # Adds a control dependency from the result_tensor to + # the replica_id to ensure that replica_id will be added to the graph. 
+ return array_ops.identity(result_tensor) + + def _set_trace_file_path(self): + """Sets the path of the output trace file.""" + + self._trace_file_path = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_FILE) + if not self._trace_file_path: + raise ValueError('--%s is not set in the environment variable %s' + %(_FLAG_NAME_TRACE_FILE, _FLAGS_ENV_VAR)) + elif TensorTracer.use_test_undeclared_outputs_dir(): + if os.path.isabs(self._trace_file_path): + raise ValueError('If use_test_undeclared_outputs_dir is set,' + 'trace_file_path cannot be an absolute path (%s)' + %self._trace_file_path) + outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR) + self._trace_file_path = os.path.join(outputs_dir, + self._trace_file_path) + + def _set_op_range(self): + """Sets the index range of the Ops that we will consider tracing.""" + + op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE) + if not op_range: + self._op_range = (-1, -1) # this means including all ops. + return + match = _OP_RANGE_PAT.match(op_range) + if not match: + self._op_range = (-1, -1) # this means including all ops. + return + self._op_range = (int(match.group(1)), int(match.group(2))) + + def _inside_op_range(self, idx): + """Return True if the given index is inside the selected range.""" + + if idx < self._op_range[0]: + return False + return self._op_range[1] < 0 or idx <= self._op_range[1] + + def _write_report(self, content): + """Writes the given content to the report.""" + + logging.info('%s %s'%(_TRACER_LOG_PREFIX, content)) + + def _is_selected_op(self, op_name): + """Returns True if the Op with op_name is selected to be traced.""" + + if not self._selected_ops: + return True + if op_name in self._selected_ops: + return True + return False + + def _write_config_section(self): + """Writes the config section of the report.""" + + self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG)) + self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version)) + self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type)) + self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode)) + self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas)) + self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG)) + + def _write_reason_section(self): + """Writes the reason section of the report.""" + + self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON)) + for key in sorted(self._instrument_records): + self._write_report('"%s" %s\n'%(key, self._instrument_records[key])) + self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON)) + + def _write_op_list_section(self, op_list): + """Writes the Op-list section of the report.""" + + self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST)) + self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list))) + for i in range(0, len(op_list)): + self._write_report('%d "%s" %s\n'%(i, op_list[i].name, op_list[i].type)) + self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST)) + + def _write_graph_section(self, succeed, sorted_or_cycle): + """Writes the graph section of the report.""" + + self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH)) + self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED, + succeed)) + l = list(sorted_or_cycle) + for i in range(0, len(l)): + self._write_report('%d "%s"\n'%(i, l[i].name)) + self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH)) + + def 
_make_tensor_trace_fun(self, op_name, output_idx): + """Makes the tensor tracing function called by outside compilation. + + Args: + op_name: the name of the Op that outputs the tensor to be traced. + output_idx: which output of the Op it is (0 means the first output). + + Returns: + A function to be passed as the first argument to outside compilation. + + Raises: + RuntimeError: If the trace mode is invalid. + """ + + def _print_tensor(op_name, output_idx, num_elements, tensor, output_tensor): + """Prints a tensor value to a file. + + Args: + op_name: the name of the Op that outputs the tensor to be printed. + output_idx: which output of the Op it is (0 means the first output). + num_elements: number of elements to print. + tensor: the tensor needs to be returned. + output_tensor: the tensor needs to be printed. + + Returns: + The same tensor passed via the "tensor" argument. + """ + msg = '"%s:%d" '%(op_name, output_idx) + output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path + print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor), + ' @', self._replica_id, + '\n', output_tensor, + summarize=num_elements, + output_stream=output_stream) + with ops.control_dependencies([print_op]): + return array_ops.identity(tensor).op + + def _detect_nan_inf(tensor): + """Trace function for detecting any NaN/Inf in the tensor.""" + + if tensor.dtype.is_floating: + # Since host can't handle bf16, always convert tensor to f32. + tensor = math_ops.cast(tensor, dtypes.float32) + output_tensor = math_ops.reduce_any( + gen_math_ops.logical_or(gen_math_ops.is_nan(tensor), + gen_math_ops.is_inf(tensor))) + else: + output_tensor = constant_op.constant(0) + return _print_tensor(op_name, output_idx, 1, tensor, output_tensor) + + def _show_global_step(tensor): + """Trace function for printing the global step count.""" + + return _print_tensor(op_name, output_idx, 1, tensor, tensor) + + def _show_part_tensor(tensor): + """Trace function for printing part of the tensor.""" + + return _print_tensor(op_name, output_idx, self._part_tensor_size, + tensor, tensor) + + def _show_full_tensor(tensor): + """Trace function for printing the entire tensor.""" + + return _print_tensor(op_name, output_idx, -1, tensor, tensor) + + if op_name == _GLOBAL_STEP_OP_NAME: + return _show_global_step + if self._trace_mode == _TRACE_MODE_NAN_INF: + return _detect_nan_inf + if self._trace_mode == _TRACE_MODE_PART_TENSOR: + return _show_part_tensor + if self._trace_mode == _TRACE_MODE_FULL_TENSOR: + return _show_full_tensor + + raise RuntimeError('Tensor trace fun for %s is not yet implemented' + %self._trace_mode) + + def trace_tpu(self, graph, result_tensor, num_replicas=None): + """Traces the tensors generated by TPU Ops in a TF graph. + + Args: + graph: the graph of Ops. + result_tensor: a result tensor of evaluating the graph. + num_replicas: number of replicas used on the TPU. + + Returns: + A tuple (result_tensor_copy, tracing_ops), where: + result_tensor_copy: an exact copy of result_tensor + tracing_ops: a list of tracing ops. If this list + is non empty, the caller of this function + should pose control dependencies upon these + Ops so that they will be executed when the + graph is evaluated. 
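      For illustration only (mirroring how the tpu_estimator.py change later in
      this patch consumes the return value), a caller might use this method
      roughly as in the sketch below; `loss`, `train_op` and `num_replicas` are
      placeholders for values owned by the caller's training graph:

        from tensorflow.contrib.tpu.python.tpu import tensor_tracer
        from tensorflow.python.framework import ops
        from tensorflow.python.ops import array_ops

        if tensor_tracer.TensorTracer.is_enabled():
          tt = tensor_tracer.TensorTracer()
          # trace_tpu returns a copy of the result tensor plus the tracing ops
          # that still have to be anchored into the graph.
          loss, tracing_ops = tt.trace_tpu(
              ops.get_default_graph(), loss, num_replicas)
        else:
          tracing_ops = []

        # Force the dangling tracing ops to run whenever the train step runs.
        with ops.control_dependencies([train_op] + tracing_ops):
          step_output = array_ops.identity(loss)

      The traces themselves are written to the file given by --trace_file_path
      (via a file:// output stream), while the report sections go to log.info.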
+ """ + + self._device_type = _DEVICE_TYPE_TPU + TensorTracer.check_device_type(self._device_type) + result_tensor_copy = self._add_replica_id_to_graph(num_replicas, + result_tensor) + self._write_config_section() + tracing_ops = [] + operations = graph.get_operations() + self._write_op_list_section(operations) + # Does the topological sort before adding any nodes to the graph. + (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph) + for op_id, op in enumerate(operations): + if not self._inside_op_range(op_id): + self._instrument_records[op.name] = TensorTracer.reason( + op_id, _RECORD_OUTSIDE_OP_RANGE) + continue + if not TensorTracer.should_trace(self._device_type, op): + self._instrument_records[op.name] = TensorTracer.reason( + op_id, _RECORD_SHOULD_NOT_TRACE) + continue + if not self._is_selected_op(op.name): + self._instrument_records[op.name] = TensorTracer.reason( + op_id, _RECORD_FILTERED_OUT) + continue + for i in range(len(op.outputs)): + out_tensor = op.outputs[i] + if not out_tensor.get_shape().is_fully_defined(): + self._instrument_records[out_tensor.name] = TensorTracer.reason( + op_id, _RECORD_DYNAMIC_SHAPE) + continue # cannot trace tensors with dynamic shape. + rank = len(out_tensor.shape) + if rank < 1: + self._instrument_records[out_tensor.name] = TensorTracer.reason( + op_id, _RECORD_SCALAR) + continue # cannot trace scalar. + self._instrument_records[out_tensor.name] = TensorTracer.reason( + op_id, _RECORD_GET_TRACED) + consumers = out_tensor.consumers() + trace_op = tpu.outside_compilation( + self._make_tensor_trace_fun(op.name, i), out_tensor) + if consumers: + for consumer_op in consumers: + # pylint: disable=protected-access + consumer_op._add_control_input(trace_op) + # pylint: enable=protected-access + else: + # if there is no consumer, we will add the control dependence later + # when we add the control dependency to the output operations. 
+ tracing_ops.append(trace_op) + + self._write_reason_section() + self._write_graph_section(succeed, sorted_or_cycle) + + return (result_tensor_copy, tracing_ops) diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py index b6bb5c6e56c..6ae718cc2c9 100644 --- a/tensorflow/contrib/tpu/python/tpu/topology.py +++ b/tensorflow/contrib/tpu/python/tpu/topology.py @@ -189,12 +189,13 @@ class Topology(object): def cpu_device_name_at_coordinates(self, device_coordinates, job=None): """Returns the CPU device attached to a logical core.""" return _tpu_host_device_name( - job, self._topology_tasks[device_coordinates]) + job, self._topology_tasks[tuple(device_coordinates)]) def tpu_device_name_at_coordinates(self, device_coordinates, job=None): """Returns the name of the TPU device assigned to a logical core.""" - return _tpu_device_name(job, self._topology_tasks[device_coordinates], - self._topology_devices[device_coordinates]) + return _tpu_device_name(job, + self._topology_tasks[tuple(device_coordinates)], + self._topology_devices[tuple(device_coordinates)]) @property def num_tasks(self): diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 555ad0f1fdb..7cb8c4aa7f1 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -31,6 +31,7 @@ import six from six.moves import queue as Queue # pylint: disable=redefined-builtin from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.contrib.tpu.python.tpu import tensor_tracer from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import error_handling from tensorflow.contrib.tpu.python.tpu import session_support @@ -108,6 +109,15 @@ ops.register_proto_function( from_proto=resource_variable_ops._from_proto_fn) # pylint: disable=protected-access +def _is_iterable(obj): + """A Python 2 and 3 compatible util to check whether `obj` is iterable.""" + try: + iter(obj) + return True + except TypeError: + return False + + def _create_global_step(graph): graph = graph or ops.get_default_graph() if training.get_global_step(graph) is not None: @@ -1317,9 +1327,15 @@ class _ModelFnWrapper(object): captured_training_hooks.capture(estimator_spec.training_hooks) + tracing_ops = [] + if tensor_tracer.TensorTracer.is_enabled(): + tt = tensor_tracer.TensorTracer() + loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss, + self._ctx.num_replicas) + # We must run train_op to update the variables prior to running the # outfeed. - with ops.control_dependencies([train_op]): + with ops.control_dependencies([train_op]+tracing_ops): host_call_outfeed_ops = [] if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec) # pylint: disable=protected-access and estimator_spec.host_call is not None): @@ -2250,8 +2266,7 @@ class TPUEstimator(estimator_lib.Estimator): # Only fetching `tpu_tensors_on_cpu` does not trigger # TPU computation and blocks, so we add the control dependency here. 
control_inputs = ( - tpu_tensors_on_cpu if isinstance(tpu_tensors_on_cpu, - (list, tuple)) else + tpu_tensors_on_cpu if _is_iterable(tpu_tensors_on_cpu) else (tpu_tensors_on_cpu,)) with ops.control_dependencies(control_inputs): new_tensors.append(array_ops.identity(t)) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index afe4c46c8ef..a701b38d4b3 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -383,6 +383,7 @@ cc_library( ":lib_platform", ":platform_base", "//tensorflow/core/platform/default/build_config:port", + "@com_google_absl//absl/base", "@snappy", ], ) @@ -1057,6 +1058,7 @@ tf_gen_op_libs( "logging_ops", "manip_ops", "math_ops", + "mkl_nn_ops", "nccl_ops", "nn_ops", "no_op", @@ -1229,7 +1231,7 @@ cc_library( ":training_ops_op_lib", ":user_ops_op_lib", ":word2vec_ops", - ] + tf_additional_cloud_op_deps(), + ] + if_mkl([":mkl_nn_ops_op_lib"]) + tf_additional_cloud_op_deps(), alwayslink = 1, ) @@ -1285,7 +1287,9 @@ cc_library( ":framework", ":lib", ":nn_ops_op_lib", - ], + ] + if_mkl([ + ":mkl_nn_ops_op_lib", + ]), alwayslink = 1, ) @@ -1668,6 +1672,7 @@ cc_library( name = "mobile_additional_lib_deps", deps = tf_additional_lib_deps() + [ "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", ], ) @@ -2168,6 +2173,7 @@ cc_library( "lib/**/*.cc", "platform/*.cc", "platform/profile_utils/**/*.cc", + ] + [ "framework/resource_handle.cc", "util/env_var.cc", ], @@ -2811,7 +2817,6 @@ tf_cuda_library( ":functional_ops_op_lib", "//tensorflow/core/kernels:bounds_check", "//tensorflow/core/kernels:required", - ":core_cpu_impl", ]), alwayslink = 1, ) @@ -2997,6 +3002,16 @@ cc_library( deps = [":lib_internal"], ) +tf_cuda_library( + name = "metrics", + srcs = ["common_runtime/metrics.cc"], + hdrs = ["common_runtime/metrics.h"], + deps = [ + ":lib", + "@com_google_absl//absl/time", + ], +) + tf_cuda_library( name = "direct_session_internal", srcs = ["common_runtime/direct_session.cc"], @@ -3013,10 +3028,12 @@ tf_cuda_library( ":graph", ":lib", ":lib_internal", + ":metrics", ":proto_text", ":protos_all_cc", "//tensorflow/core/debug:debug_graph_utils", "//tensorflow/core/kernels:function_ops", + "@com_google_absl//absl/time", ], alwayslink = 1, ) @@ -3048,7 +3065,9 @@ tf_cuda_library( ], copts = tf_copts(), cuda_deps = if_cuda_is_configured(tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps()), - visibility = ["//visibility:private"], + visibility = [ + "//tensorflow:internal", + ], deps = [ ":core_cpu_internal", ":lib", diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt index cdaeb5091c7..bfaf3d2ea59 100644 --- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt @@ -4,7 +4,7 @@ op { in_arg { name: "float_values" description: <
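As a closing aside on the tpu_estimator.py hunk above: the new _is_iterable helper lets the control-dependency normalization accept any iterable of tensors rather than only a list or tuple, while a single tensor-like value is still wrapped in a one-element tuple. Below is a minimal self-contained sketch of that idiom, illustrative only; the constants are placeholders, _is_iterable is repeated just so the snippet runs on its own, and it assumes TF 1.x graph mode, where iterating a Tensor raises TypeError.

import tensorflow as tf


def _is_iterable(obj):
  # Python 2/3 compatible check for whether `obj` is iterable.
  try:
    iter(obj)
    return True
  except TypeError:
    return False


single = tf.constant(1.0)                    # not iterable in graph mode
many = [tf.constant(2.0), tf.constant(3.0)]  # already iterable

for value in (single, many):
  control_inputs = value if _is_iterable(value) else (value,)
  with tf.control_dependencies(control_inputs):
    out = tf.identity(tf.constant(0.0))  # runs only after control_inputs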